Merge pull request #54 from FreeRDP/remotefx

Remotefx
author: Marc-André Moreau <marcandre.moreau@gmail.com> 2011-06-28 18:19:40 +0400
committer: Marc-André Moreau <marcandre.moreau@gmail.com> 2011-06-28 18:19:40 +0400
commit: 46c1d75a72521b548e8f0f677b86f3aab01027e8 (patch)
tree: 0a5f57437e5c7cfcfee3db4a71564bc4211a4b06
parent: b2e7d3410410a6e79e95ed6c3ba354fdd4e7ed05 (diff)
parent: c78f823e681a9e53cb8f89d78ab9bdf65405656c (diff)
58 files changed, 3264 insertions, 393 deletions
diff --git a/Makefile.am b/Makefile.am
index 5d63bc9..daa2a61 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,9 +1,8 @@
 ## Process this file with automake to produce Makefile.in
 REQUIRED_SUBDIRS = \
 	libfreerdp-asn1 \
-	libfreerdp-gdi \
-	libfreerdp-rfx/sse \
 	libfreerdp-rfx \
+	libfreerdp-gdi \
 	libfreerdp-utils \
 	libfreerdp-core \
 	docs \
diff --git a/X11/xf_win.c b/X11/xf_win.c
index ce1cc7f..3f6accf 100644
--- a/X11/xf_win.c
+++ b/X11/xf_win.c
@@ -1073,6 +1073,8 @@ RD_BOOL
 l_ui_check_certificate(rdpInst * inst, const char * fingerprint,
 	const char * subject, const char * issuer, RD_BOOL verified)
 {
+	//char answer;
+
 	printf("certificate details:\n");
 	printf("  Subject:\n    %s\n", subject);
 	printf("  Issued by:\n    %s\n", issuer);
@@ -1081,7 +1083,17 @@ l_ui_check_certificate(rdpInst * inst, const char * fingerprint,
 	if (!verified)
 		printf("The server could not be authenticated. Connection security may be compromised!\n");
 
+#if 0
+	printf("Accept this certificate? (Y/N): ");
+	answer = fgetc(stdin);
+
+	if (answer == 'y' || answer == 'Y')
+		return True;
+	else
+		return False;
+#else
 	return True;
+#endif
 }
 
 static int
diff --git a/configure.ac b/configure.ac
index 39e3863..6cf6e02 100644
--- a/configure.ac
+++ b/configure.ac
@@ -19,7 +19,9 @@ AH_TEMPLATE(IPv6, [IPv6])
 AH_TEMPLATE(NEED_ALIGN, [Alignment])
 AH_TEMPLATE(DISABLE_TLS, [Disable TLS encryption])
 AH_TEMPLATE(WITH_SSE, [Enable SSE Optimizations])
+AH_TEMPLATE(WITH_NEON, [Enable NEON Optimizations])
 AH_TEMPLATE(WITH_XKBFILE, [Use xkbfile for keyboard handling])
+AH_TEMPLATE(WITH_PROFILER, [Turn on the code profiler])
 AH_TEMPLATE(WITH_DEBUG, [Turn on debugging messages])
 AH_TEMPLATE(WITH_DEBUG_RDP, [Turn on debugging messages])
 AH_TEMPLATE(WITH_DEBUG_GDI, [Turn on debugging messages])
@@ -1026,24 +1028,52 @@ AC_ARG_WITH([cunit],
 )
 
 #
-# SSE
+# Profiler
 #
-sse="yes"
-AC_ARG_WITH([sse],
-	[AS_HELP_STRING([--with-sse], [Enable SSE Optimizations [default=yes]])])
-AS_IF([test "x$with_sse" == xno],
+profiler="no"
+AC_ARG_WITH([profiler],
+	[AS_HELP_STRING([--with-profiler], [enable the code profiler])])
+AS_IF([test "x$with_profiler" == xyes],
 	[
-		sse="no"
-		AM_CONDITIONAL(WITH_SSE, false)
+		profiler="yes"
+		AM_CONDITIONAL(WITH_PROFILER, true)
+		AC_DEFINE(WITH_PROFILER,1)
 	],
 	[
-		sse="yes"
+		profiler="no"
+		AM_CONDITIONAL(WITH_PROFILER, false)
+	]
+)
+
+#
+# SSE
+#
+AM_CONDITIONAL(WITH_SSE, false)
+AC_ARG_WITH(sse,
+    [  --with-sse  enable SSE optimizations],
+    [
+        if test $withval != "no";
+        then
 		AM_CONDITIONAL(WITH_SSE, true)
 		AC_DEFINE(WITH_SSE,1)
 		CFLAGS="$CFLAGS -msse2"
-		REQUIRED_SUBDIRS="libfreerdp-rfx/sse $REQUIRED_SUBDIRS"
-	]
-)
+        fi
+    ])
+
+#
+# NEON
+#
+AM_CONDITIONAL(WITH_NEON, false)
+AC_ARG_WITH(neon,
+    [  --with-neon  enable NEON optimizations],
+    [
+        if test $withval != "no";
+        then
+		AM_CONDITIONAL(WITH_NEON, true)
+		AC_DEFINE(WITH_NEON,1)
+		CFLAGS="$CFLAGS -mfpu=neon"
+        fi
+    ])
 
 #
 # X11
@@ -1139,9 +1169,12 @@ AC_CONFIG_FILES([
 Makefile
 freerdp.pc
 libfreerdp-asn1/Makefile
-libfreerdp-gdi/Makefile
 libfreerdp-rfx/Makefile
 libfreerdp-rfx/sse/Makefile
+libfreerdp-rfx/neon/Makefile
+libfreerdp-gdi/Makefile
+libfreerdp-gdi/sse/Makefile
+libfreerdp-gdi/neon/Makefile
 libfreerdp-utils/Makefile
 libfreerdp-core/Makefile
 docs/Makefile
diff --git a/cunit/test_librfx.c b/cunit/test_librfx.c
index 42913d4..78c453d 100644
--- a/cunit/test_librfx.c
+++ b/cunit/test_librfx.c
@@ -32,6 +32,7 @@
 #include "rfx_quantization.h"
 #include "rfx_dwt.h"
 #include "rfx_decode.h"
+#include "rfx_encode.h"
 
 #include "test_librfx.h"
 
@@ -60,7 +61,7 @@ static const uint8 y_data[] =
 
 static const uint8 cb_data[] =
 {
-		                                                  0x1b, 0x04, 0x7f, 0x04, 0x31, 0x5f, 0xc2,
+	                                                      0x1b, 0x04, 0x7f, 0x04, 0x31, 0x5f, 0xc2,
 	0x94, 0xaf, 0x05, 0x29, 0x5e, 0x0a, 0x52, 0xbc, 0x14, 0xa5, 0x78, 0x29, 0x25, 0x78, 0x29, 0x25,
 	0x78, 0x29, 0x25, 0x68, 0x52, 0x4a, 0xf0, 0x52, 0x4a, 0xf0, 0x52, 0x4a, 0xd0, 0xa4, 0x95, 0xe0,
 	0xa4, 0x95, 0xe0, 0xa4, 0x95, 0xa1, 0x49, 0x2b, 0xc1, 0x49, 0x2b, 0xc1, 0x49, 0x2b, 0x42, 0x92,
@@ -85,7 +86,7 @@ static const uint8 cb_data[] =
 
 static const uint8 cr_data[] =
 {
-		                                0x1b, 0xfc, 0x11, 0xc1, 0x0f, 0x4a, 0xc1, 0x4f, 0x4a, 0xc1,
+	                                    0x1b, 0xfc, 0x11, 0xc1, 0x0f, 0x4a, 0xc1, 0x4f, 0x4a, 0xc1,
 	0x4f, 0x4a, 0xa1, 0x4d, 0x95, 0x42, 0x9e, 0x95, 0x42, 0x9e, 0x95, 0x42, 0x9b, 0x2a, 0x85, 0x3d,
 	0x2a, 0x85, 0x3d, 0x2a, 0x85, 0x36, 0x55, 0x0a, 0x7a, 0x55, 0x0a, 0x7a, 0x55, 0x0a, 0x6c, 0xaa,
 	0x14, 0xf4, 0xaa, 0x14, 0xf4, 0xaa, 0x14, 0xd9, 0x54, 0x29, 0xe9, 0x54, 0x29, 0xe9, 0x54, 0x29,
@@ -114,6 +115,44 @@ static const unsigned int test_quantization_values[] =
 	6, 6, 6, 6, 7, 7, 8, 8, 8, 9
 };
 
+static const uint8 rgb_scanline_data[] =
+{
+	0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00,
+	0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00,
+	0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00,
+	0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00,
+	0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00,
+	0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF,
+	0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF,
+	0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF,
+	0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF,
+	0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF,
+
+	0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00,
+	0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00,
+	0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00,
+	0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00,
+	0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00,
+	0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00,
+	0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF,
+	0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF,
+	0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF,
+	0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF,
+	0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF
+};
+
+static uint8 * rgb_data;
 
 int init_librfx_suite(void)
 {
@@ -130,11 +169,14 @@ int add_librfx_suite(void)
 	add_test_suite(librfx);
 
 	add_test_function(bitstream);
+	add_test_function(bitstream_enc);
 	add_test_function(rlgr);
 	add_test_function(differential);
 	add_test_function(quantization);
 	add_test_function(dwt);
 	add_test_function(decode);
+	add_test_function(encode);
+	add_test_function(message);
 
 	return 0;
 }
@@ -142,7 +184,7 @@ int add_librfx_suite(void)
 void
 test_bitstream(void)
 {
-	unsigned int b;
+	uint16 b;
 	RFX_BITSTREAM * bs;
 
 	bs = rfx_bitstream_new();
@@ -157,10 +199,33 @@ test_bitstream(void)
 	//printf("\n");
 }
 
-static unsigned int buffer[4096];
+void
+test_bitstream_enc(void)
+{
+	uint8 buffer[10];
+	RFX_BITSTREAM * bs;
+	int i;
+
+	bs = rfx_bitstream_new();
+	memset(buffer, 0, sizeof(buffer));
+	rfx_bitstream_put_buffer(bs, buffer, sizeof(buffer));
+	for (i = 0; i < 16; i++)
+	{
+		rfx_bitstream_put_bits(bs, i, 5);
+	}
+	/*for (i = 0; i < sizeof(buffer); i++)
+	{
+		printf("%X ", buffer[i]);
+	}*/
+	rfx_bitstream_free(bs);
+
+	//printf("\n");
+}
+
+static sint16 buffer[4096];
 
 void
-dump_buffer(int * buf, int n)
+dump_buffer(sint16 * buf, int n)
 {
 	int i;
 
@@ -168,7 +233,7 @@ dump_buffer(int * buf, int n)
 	{
 		if (i % 16 == 0)
 			printf("\n%04d ", i);
-		printf("% 3d ", buf[i]);
+		printf("% 4d ", buf[i]);
 	}
 	printf("\n");
 }
@@ -194,16 +259,7 @@ test_differential(void)
 void
 test_quantization(void)
 {
-	rfx_quantization_decode(buffer, 1024, test_quantization_values[0]); /* HL1 */
-	rfx_quantization_decode(buffer + 1024, 1024, test_quantization_values[1]); /* LH1 */
-	rfx_quantization_decode(buffer + 2048, 1024, test_quantization_values[2]); /* HH1 */
-	rfx_quantization_decode(buffer + 3072, 256, test_quantization_values[3]); /* HL2 */
-	rfx_quantization_decode(buffer + 3328, 256, test_quantization_values[4]); /* LH2 */
-	rfx_quantization_decode(buffer + 3584, 256, test_quantization_values[5]); /* HH2 */
-	rfx_quantization_decode(buffer + 3840, 64, test_quantization_values[6]); /* HL3 */
-	rfx_quantization_decode(buffer + 3904, 64, test_quantization_values[7]); /* LH3 */
-	rfx_quantization_decode(buffer + 3868, 64, test_quantization_values[8]); /* HH3 */
-	rfx_quantization_decode(buffer + 4032, 64, test_quantization_values[9]); /* LL3 */
+	rfx_quantization_decode(buffer, test_quantization_values);
 	//dump_buffer(buffer, 4096);
 }
 
@@ -213,13 +269,30 @@ test_dwt(void)
 	RFX_CONTEXT * context;
 
 	context = rfx_context_new();
-	rfx_dwt_2d_decode(context, (int*) buffer + 3840, 8);
-	rfx_dwt_2d_decode(context, (int*) buffer + 3072, 16);
-	rfx_dwt_2d_decode(context, (int*) buffer, 32);
+	rfx_dwt_2d_decode(buffer, context->dwt_buffer);
 	//dump_buffer(buffer, 4096);
 	rfx_context_free(context);
 }
 
+static void
+dump_ppm_image(uint8 * image_buf)
+{
+	/* Dump a .ppm image. */
+	static int frame_id = 0;
+	char buf[100];
+	FILE * fp;
+
+	snprintf(buf, sizeof(buf), "/tmp/FreeRDP_Frame_%d.ppm", frame_id);
+	fp = fopen(buf, "wb");
+	fwrite("P6\n", 1, 3, fp);
+	fwrite("64 64\n", 1, 6, fp);
+	fwrite("255\n", 1, 4, fp);
+	fwrite(image_buf, 1, 4096 * 3, fp);
+	fflush(fp);
+	fclose(fp);
+	frame_id++;
+}
+
 void
 test_decode(void)
 {
@@ -236,20 +309,91 @@ test_decode(void)
 		decode_buffer);
 	rfx_context_free(context);
 
-	/* Dump a .ppm image. */
-	static int frame_id = 0;
-	char buf[100];
-	FILE * fp;
+	dump_ppm_image(decode_buffer);
+}
 
-	snprintf(buf, sizeof(buf), "/tmp/FreeRDP_Frame_%d.ppm", frame_id);
-	fp = fopen(buf, "wb");
-	fwrite("P6\n", 1, 3, fp);
-	fwrite("64 64\n", 1, 6, fp);
-	fwrite("255\n", 1, 4, fp);
-	fwrite(decode_buffer, 1, 4096 * 3, fp);
-	fflush(fp);
-	fclose(fp);
-	frame_id++;
+void
+test_encode(void)
+{
+	RFX_CONTEXT * context;
+	uint8 ycbcr_buffer[1024000];
+	int y_size, cb_size, cr_size;
+	int i;
+	uint8 decode_buffer[4096 * 3];
+
+	rgb_data = (uint8 *) malloc(64 * 64 * 3);
+	for (i = 0; i < 64; i++)
+		memcpy(rgb_data + i * 64 * 3, rgb_scanline_data, 64 * 3);
+	//hexdump(rgb_data, 64 * 64 * 3);
+
+	context = rfx_context_new();
+	context->mode = RLGR3;
+	rfx_context_set_pixel_format(context, RFX_PIXEL_FORMAT_RGB);
+
+	rfx_encode_rgb(context, rgb_data, 64, 64, 64 * 3,
+		test_quantization_values, test_quantization_values, test_quantization_values,
+		ycbcr_buffer, sizeof(ycbcr_buffer), &y_size, &cb_size, &cr_size);
+	//dump_buffer(context->cb_g_buffer, 4096);
+
+	/*printf("*** Y ***\n");
+	hexdump(ycbcr_buffer, y_size);
+	printf("*** Cb ***\n");
+	hexdump(ycbcr_buffer + y_size, cb_size);
+	printf("*** Cr ***\n");
+	hexdump(ycbcr_buffer + y_size + cb_size, cr_size);*/
+
+	rfx_decode_rgb(context,
+		ycbcr_buffer, y_size, test_quantization_values,
+		ycbcr_buffer + y_size, cb_size, test_quantization_values,
+		ycbcr_buffer + y_size + cb_size, cr_size, test_quantization_values,
+		decode_buffer);
+	dump_ppm_image(decode_buffer);
+
+	rfx_context_free(context);
+	free(rgb_data);
 }
 
+void
+test_message(void)
+{
+	RFX_CONTEXT * context;
+	uint8 buffer[1024000];
+	int size;
+	int i, j;
+	RFX_RECT rect = {0, 0, 100, 80};
+	RFX_MESSAGE * message;
 
+	rgb_data = (uint8 *) malloc(100 * 80 * 3);
+	for (i = 0; i < 80; i++)
+		memcpy(rgb_data + i * 100 * 3, rgb_scanline_data, 100 * 3);
+
+	context = rfx_context_new();
+	context->mode = RLGR3;
+	context->width = 800;
+	context->height = 600;
+	rfx_context_set_pixel_format(context, RFX_PIXEL_FORMAT_RGB);
+
+	size = rfx_compose_message_header(context, buffer, sizeof(buffer));
+	/*hexdump(buffer, size);*/
+	message = rfx_process_message(context, buffer, size);
+	rfx_message_free(context, message);
+
+	for (i = 0; i < 1000; i++)
+	{
+		size = rfx_compose_message_data(context, buffer, sizeof(buffer),
+			&rect, 1, rgb_data, 100, 80, 100 * 3);
+		/*hexdump(buffer, size);*/
+		message = rfx_process_message(context, buffer, size);
+		if (i == 0)
+		{
+			for (j = 0; j < message->num_tiles; j++)
+			{
+				dump_ppm_image(message->tiles[j]->data);
+			}
+		}
+		rfx_message_free(context, message);
+	}
+
+	rfx_context_free(context);
+	free(rgb_data);
+}
diff --git a/cunit/test_librfx.h b/cunit/test_librfx.h
index 726146a..51d9bcf 100644
--- a/cunit/test_librfx.h
+++ b/cunit/test_librfx.h
@@ -26,6 +26,8 @@ int add_librfx_suite(void);
 void
 test_bitstream(void);
 void
+test_bitstream_enc(void);
+void
 test_rlgr(void);
 void
 test_differential(void);
@@ -35,4 +37,8 @@ void
 test_dwt(void);
 void
 test_decode(void);
+void
+test_encode(void);
+void
+test_message(void);
 
diff --git a/dfb/Makefile.am b/dfb/Makefile.am
index c65c5bc..42079ce 100644
--- a/dfb/Makefile.am
+++ b/dfb/Makefile.am
@@ -23,6 +23,7 @@ dfbfreerdp_LDADD = \
 	../libfreerdp-kbd/libfreerdp-kbd.la \
 	../libfreerdp-chanman/libfreerdp-chanman.la \
 	../libfreerdp-core/libfreerdp-core.la \
+	../libfreerdp-utils/libfreerdp-utils.la \
 	@DFB_LIBS@ -lfusion -ldirect -lz
 
 
diff --git a/dfb/dfb_win.c b/dfb/dfb_win.c
index 5dd7c89..ac92c57 100644
--- a/dfb/dfb_win.c
+++ b/dfb/dfb_win.c
@@ -568,12 +568,16 @@ dfb_post_connect(rdpInst * inst)
 }
 
 void
-dfb_uninit(void * dfb_info)
+dfb_uninit(rdpInst * inst)
 {
-	dfbInfo * dfbi;
-	dfbi = (dfbInfo *) dfb_info;
-	dfbi->primary->Release(dfbi->primary);
-	dfbi->dfb->Release(dfbi->dfb);
+	dfbInfo *dfbi = GET_DFBI(inst);
+
+	if (inst->settings->software_gdi == 1)
+	{
+		gdi_free(inst);
+		dfbi->primary->Release(dfbi->primary);
+		dfbi->dfb->Release(dfbi->dfb);
+	}
 }
 
 int
diff --git a/dfb/dfb_win.h b/dfb/dfb_win.h
index b33ca39..7dfcb05 100644
--- a/dfb/dfb_win.h
+++ b/dfb/dfb_win.h
@@ -29,7 +29,7 @@ dfb_pre_connect(rdpInst * inst);
 int
 dfb_post_connect(rdpInst * inst);
 void
-dfb_uninit(void * dfb_info);
+dfb_uninit(rdpInst * inst);
 int
 dfb_get_fds(rdpInst * inst, void ** read_fds, int * read_count,
 	void ** write_fds, int * write_count);
diff --git a/dfb/dfbfreerdp.c b/dfb/dfbfreerdp.c
index 275fe81..c207523 100644
--- a/dfb/dfbfreerdp.c
+++ b/dfb/dfbfreerdp.c
@@ -503,7 +503,6 @@ static int
 run_dfbfreerdp(rdpSet * settings, rdpChanMan * chan_man)
 {
 	rdpInst * inst;
-	void * dfb_info;
 	void * read_fds[32];
 	void * write_fds[32];
 	int read_count;
@@ -643,10 +642,9 @@ run_dfbfreerdp(rdpSet * settings, rdpChanMan * chan_man)
 		}
 	}
 	/* cleanup */
-	dfb_info = inst->param1;
 	inst->rdp_disconnect(inst);
+	dfb_uninit(inst);
 	freerdp_free(inst);
-	dfb_uninit(dfb_info);
 	return 0;
 }
 
diff --git a/include/freerdp/constants/ui.h b/include/freerdp/constants/ui.h
index 02c17e9..61db60c 100644
--- a/include/freerdp/constants/ui.h
+++ b/include/freerdp/constants/ui.h
@@ -54,7 +54,7 @@
 #define PERF_ENABLE_DESKTOP_COMPOSITION 0x00000100
 
 /* Surface Command */
-#define CMDTYPE_SET_SURFACE_BITS        0x0001
+#define CMDTYPE_SET_SURFACE_BITS	0x0001
 #define CMDTYPE_FRAME_MARKER            0x0004
 #define CMDTYPE_STREAM_SURFACE_BITS     0x0006
 
diff --git a/include/freerdp/rfx.h b/include/freerdp/rfx.h
index 346a26f..932f5f5 100644
--- a/include/freerdp/rfx.h
+++ b/include/freerdp/rfx.h
@@ -22,6 +22,8 @@
 
 #include "types/base.h"
 
+#include <freerdp/utils/profiler.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -39,9 +41,13 @@ extern "C" {
 #define WBT_FRAME_END		0xCCC5
 #define WBT_REGION		0xCCC6
 #define WBT_EXTENSION		0xCCC7
+#define CBT_REGION		0xCAC1
 #define CBT_TILESET		0xCAC2
 #define CBT_TILE		0xCAC3
 
+/* tileSize */
+#define CT_TILE_64x64		0x0040
+
 /* properties.flags */
 #define CODEC_MODE		0x02
 
@@ -120,7 +126,8 @@ typedef struct _RFX_MESSAGE RFX_MESSAGE;
 
 struct _RFX_CONTEXT
 {
-	int flags;
+	uint16 flags;
+	uint16 properties;
 	uint16 width;
 	uint16 height;
 	RLGR_MODE mode;
@@ -128,29 +135,58 @@ struct _RFX_CONTEXT
 	uint32 codec_id;
 	uint32 codec_version;
 	RFX_PIXEL_FORMAT pixel_format;
+	uint8 bytes_per_pixel;
 
 	/* temporary data within a frame */
+	uint32 frame_idx;
 	uint8 num_quants;
 	uint32 * quants;
+	uint8 quant_idx_y;
+	uint8 quant_idx_cb;
+	uint8 quant_idx_cr;
 
 	/* pre-allocated buffers */
 
 	RFX_POOL* pool; /* memory pool */
 
-	uint32 y_r_mem[4096+4]; /* 4096 = 64x64 (+ 4x4 = 16 for mem align) */
-	uint32 cb_g_mem[4096+4]; /* 4096 = 64x64 (+ 4x4 = 16 for mem align) */
-	uint32 cr_b_mem[4096+4]; /* 4096 = 64x64 (+ 4x4 = 16 for mem align) */
+	sint16 y_r_mem[4096+8]; /* 4096 = 64x64 (+ 8x2 = 16 for mem align) */
+	sint16 cb_g_mem[4096+8]; /* 4096 = 64x64 (+ 8x2 = 16 for mem align) */
+	sint16 cr_b_mem[4096+8]; /* 4096 = 64x64 (+ 8x2 = 16 for mem align) */
  
- 	uint32* y_r_buffer;
-	uint32* cb_g_buffer;
-	uint32* cr_b_buffer;
+ 	sint16 * y_r_buffer;
+	sint16 * cb_g_buffer;
+	sint16 * cr_b_buffer;
  
-	uint32 idwt_buffer_8[256]; /* sub-band width 8 */
-	uint32 idwt_buffer_16[1024]; /* sub-band width 16 */
-	uint32 idwt_buffer_32[4096]; /* sub-band width 32 */
-	uint32* idwt_buffers[5]; /* sub-band buffer array */
-	
-	void (* decode_YCbCr_to_RGB)(uint32 * y_r_buf, uint32 * cb_g_buf, uint32 * cr_b_buf);
+	sint16 dwt_mem[32*32*2*2 + 8]; /* maximum sub-band width is 32 */
+
+	sint16 * dwt_buffer;
+
+	/* routines */
+	void (* decode_YCbCr_to_RGB)(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf);
+	void (* encode_RGB_to_YCbCr)(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf);
+	void (* quantization_decode)(sint16 * buffer, const uint32 * quantization_values);
+	void (* quantization_encode)(sint16 * buffer, const uint32 * quantization_values);
+	void (* dwt_2d_decode)(sint16 * buffer, sint16 * dwt_buffer);
+	void (* dwt_2d_encode)(sint16 * buffer, sint16 * dwt_buffer);
+
+	/* profiler definitions */
+	PROFILER_DEFINE(prof_rfx_decode_rgb);
+	PROFILER_DEFINE(prof_rfx_decode_component);
+	PROFILER_DEFINE(prof_rfx_rlgr_decode);
+	PROFILER_DEFINE(prof_rfx_differential_decode);
+	PROFILER_DEFINE(prof_rfx_quantization_decode);
+	PROFILER_DEFINE(prof_rfx_dwt_2d_decode);
+	PROFILER_DEFINE(prof_rfx_decode_YCbCr_to_RGB);
+	PROFILER_DEFINE(prof_rfx_decode_format_RGB);
+
+	PROFILER_DEFINE(prof_rfx_encode_rgb);
+	PROFILER_DEFINE(prof_rfx_encode_component);
+	PROFILER_DEFINE(prof_rfx_rlgr_encode);
+	PROFILER_DEFINE(prof_rfx_differential_encode);
+	PROFILER_DEFINE(prof_rfx_quantization_encode);
+	PROFILER_DEFINE(prof_rfx_dwt_2d_encode);
+	PROFILER_DEFINE(prof_rfx_encode_RGB_to_YCbCr);
+	PROFILER_DEFINE(prof_rfx_encode_format_RGB);
 };
 typedef struct _RFX_CONTEXT RFX_CONTEXT;
 
@@ -158,9 +194,13 @@ RFX_CONTEXT* rfx_context_new(void);
 void rfx_context_free(RFX_CONTEXT * context);
 void rfx_context_set_pixel_format(RFX_CONTEXT * context, RFX_PIXEL_FORMAT pixel_format);
 
-RFX_MESSAGE* rfx_process_message(RFX_CONTEXT * context, uint8 * data, int data_size);
+RFX_MESSAGE* rfx_process_message(RFX_CONTEXT * context, uint8 * data, int size);
 void rfx_message_free(RFX_CONTEXT * context, RFX_MESSAGE * message);
 
+int rfx_compose_message_header(RFX_CONTEXT * context, uint8 * buffer, int buffer_size);
+int rfx_compose_message_data(RFX_CONTEXT * context, uint8 * buffer, int buffer_size,
+	const RFX_RECT * rects, int num_rects, uint8 * image_data, int width, int height, int rowstride);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/freerdp/utils/Makefile.am b/include/freerdp/utils/Makefile.am
index eec4969..410babf 100644
--- a/include/freerdp/utils/Makefile.am
+++ b/include/freerdp/utils/Makefile.am
@@ -6,7 +6,9 @@ include_HEADERS = \
 	chan_plugin.h \
 	datablob.h \
 	memory.h \
+	profiler.h \
 	semaphore.h \
+	stopwatch.h \
 	stream.h \
 	unicode.h \
 	wait_obj.h
diff --git a/include/freerdp/utils/profiler.h b/include/freerdp/utils/profiler.h
new file mode 100644
index 0000000..1a149c1
--- /dev/null
+++ b/include/freerdp/utils/profiler.h
@@ -0,0 +1,71 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   Profiler Utils
+
+   Copyright 2011 Stephen Erisman
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef __UTILS_PROFILER_H
+#define __UTILS_PROFILER_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+
+#include <freerdp/utils/memory.h>
+#include <freerdp/utils/stopwatch.h>
+
+struct _PROFILER
+{
+	char *name;	
+	STOPWATCH *stopwatch;
+};
+typedef struct _PROFILER PROFILER;
+
+PROFILER * profiler_create(char * name);
+void profiler_free(PROFILER * profiler);
+
+void profiler_enter(PROFILER * profiler);
+void profiler_exit(PROFILER * profiler);
+
+void profiler_print_header();
+void profiler_print(PROFILER * profiler);
+void profiler_print_footer();
+
+#ifdef WITH_PROFILER
+#define IF_PROFILER(then)			then
+#define PROFILER_DEFINE(prof)		PROFILER * prof
+#define PROFILER_CREATE(prof,name)	prof = profiler_create(name)
+#define PROFILER_FREE(prof)			profiler_free(prof)
+#define PROFILER_ENTER(prof)		profiler_enter(prof)
+#define PROFILER_EXIT(prof)			profiler_exit(prof)
+#define PROFILER_PRINT_HEADER		profiler_print_header()
+#define PROFILER_PRINT(prof)		profiler_print(prof)
+#define PROFILER_PRINT_FOOTER		profiler_print_footer()
+#else
+#define IF_PROFILER(then)			;
+#define PROFILER_DEFINE(prof)		;
+#define PROFILER_CREATE(prof,name)	;
+#define PROFILER_FREE(prof)			;
+#define PROFILER_ENTER(prof)		;
+#define PROFILER_EXIT(prof)			;
+#define PROFILER_PRINT_HEADER		;
+#define PROFILER_PRINT(prof)		;
+#define PROFILER_PRINT_FOOTER		;
+#endif
+
+#endif /* __UTILS_PROFILER_H */
diff --git a/include/freerdp/utils/stopwatch.h b/include/freerdp/utils/stopwatch.h
new file mode 100644
index 0000000..02db7ea
--- /dev/null
+++ b/include/freerdp/utils/stopwatch.h
@@ -0,0 +1,44 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   Stopwatch Utils
+
+   Copyright 2011 Stephen Erisman
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef __UTILS_STOPWATCH_H
+#define __UTILS_STOPWATCH_H
+
+#include <freerdp/utils/memory.h>
+#include <time.h>
+
+struct _STOPWATCH
+{
+	clock_t start;
+	clock_t end;
+	double elapsed;
+	clock_t count;
+};
+typedef struct _STOPWATCH STOPWATCH;
+
+STOPWATCH * stopwatch_create();
+void stopwatch_free(STOPWATCH * stopwatch);
+
+void stopwatch_start(STOPWATCH * stopwatch);
+void stopwatch_stop(STOPWATCH * stopwatch);
+void stopwatch_reset(STOPWATCH * stopwatch);
+
+double stopwatch_get_elapsed_time_in_seconds(STOPWATCH * stopwatch);
+
+#endif /* __UTILS_STOPWATCH_H */
diff --git a/libfreerdp-gdi/Makefile.am b/libfreerdp-gdi/Makefile.am
index bda50bc..2210079 100644
--- a/libfreerdp-gdi/Makefile.am
+++ b/libfreerdp-gdi/Makefile.am
@@ -21,6 +21,7 @@ libfreerdp_gdi_la_SOURCES = \
 	gdi_8bpp.c gdi_8bpp.h \
 	color.c color.h \
 	decode.c decode.h \
+	libgdi.h \
 	gdi.c gdi.h
 
 libfreerdp_gdi_la_CFLAGS = \
@@ -30,9 +31,21 @@ libfreerdp_gdi_la_CFLAGS = \
 
 libfreerdp_gdi_la_LDFLAGS =
 
-libfreerdp_gdi_la_LIBDADD = \
+libfreerdp_gdi_la_LIBADD = \
 	../libfreerdp-rfx/libfreerdp-rfx.la
 
+if WITH_SSE
+SUBDIRS = sse
+libfreerdp_gdi_la_CFLAGS += -I./sse
+libfreerdp_gdi_la_LIBADD += ./sse/libfreerdp-gdi-sse.la
+endif
+
+if WITH_NEON
+SUBDIRS = neon
+libfreerdp_gdi_la_CFLAGS += -I./neon
+libfreerdp_gdi_la_LIBADD += ./neon/libfreerdp-gdi-neon.la
+endif
+
 # extra
 EXTRA_DIST =
 
diff --git a/libfreerdp-gdi/color.c b/libfreerdp-gdi/color.c
index 371f69a..76f72e9 100644
--- a/libfreerdp-gdi/color.c
+++ b/libfreerdp-gdi/color.c
@@ -618,7 +618,7 @@ uint8* gdi_image_convert_32bpp(uint8* srcData, uint8* dstData, int width, int he
 	return srcData;
 }
 
-p_gdi_image_convert gdi_image_convert_[5] =
+p_gdi_image_convert_bpp gdi_image_convert_[5] =
 {
 	NULL,
 	gdi_image_convert_8bpp,
@@ -629,7 +629,7 @@ p_gdi_image_convert gdi_image_convert_[5] =
 
 uint8* gdi_image_convert(uint8* srcData, uint8* dstData, int width, int height, int srcBpp, int dstBpp, HCLRCONV clrconv)
 {
-	p_gdi_image_convert _p_gdi_image_convert = gdi_image_convert_[IBPP(srcBpp)];
+	p_gdi_image_convert_bpp _p_gdi_image_convert = gdi_image_convert_[IBPP(srcBpp)];
 
 	if (_p_gdi_image_convert != NULL)
 		return _p_gdi_image_convert(srcData, dstData, width, height, srcBpp, dstBpp, clrconv);
diff --git a/libfreerdp-gdi/color.h b/libfreerdp-gdi/color.h
index 2e116c1..14e4f1e 100644
--- a/libfreerdp-gdi/color.h
+++ b/libfreerdp-gdi/color.h
@@ -233,7 +233,7 @@ typedef CLRCONV* HCLRCONV;
 
 #define IBPP(_bpp) (((_bpp + 1)/ 8) % 5)
 
-typedef uint8* (*p_gdi_image_convert)(uint8* srcData, uint8* dstData, int width, int height, int srcBpp, int dstBpp, HCLRCONV clrconv);
+typedef uint8* (*p_gdi_image_convert_bpp)(uint8* srcData, uint8* dstData, int width, int height, int srcBpp, int dstBpp, HCLRCONV clrconv);
 
 int gdi_get_pixel(uint8 * data, int x, int y, int width, int height, int bpp);
 void gdi_set_pixel(uint8* data, int x, int y, int width, int height, int bpp, int pixel);
@@ -244,6 +244,8 @@ uint8* gdi_mono_image_convert(uint8* srcData, int width, int height, int srcBpp,
 int gdi_mono_cursor_convert(uint8* srcData, uint8* maskData, uint8* xorMask, uint8* andMask, int width, int height, int bpp, HCLRCONV clrconv);
 int gdi_alpha_cursor_convert(uint8* alphaData, uint8* xorMask, uint8* andMask, int width, int height, int bpp, HCLRCONV clrconv);
 
+typedef uint8* (*p_gdi_image_convert)(uint8* srcData, uint8 *dstData, int width, int height, int srcBpp, int dstBpp, HCLRCONV clrconv);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/libfreerdp-gdi/decode.c b/libfreerdp-gdi/decode.c
index 133d706..9f3f4c0 100644
--- a/libfreerdp-gdi/decode.c
+++ b/libfreerdp-gdi/decode.c
@@ -30,67 +30,150 @@
 
 #include "decode.h"
 
-void gdi_decode_frame(GDI *gdi, int x, int y, uint8 * data, uint32 length)
+int gdi_decode_bitmap_data_ex(GDI *gdi, uint16 x, uint16 y, uint8 * data, int size)
 {
-	int i, tx, ty;
+	int i, j;
+	int tx, ty;
+	uint8* bitmapData;
+	uint32 bitmapDataLength;
 	RFX_MESSAGE * message;
 
-	message = rfx_process_message((RFX_CONTEXT *) gdi->rfx_context, data, length);
+	/* BITMAP_DATA_EX */
+	/* bpp (1 byte) */
+	/* reserved1 (1 byte) */
+	/* reserved2 (1 byte) */
+	/* codecID (1 byte) */
+	/* width (2 bytes) */
+	/* height (2 bytes) */
+	bitmapDataLength = GET_UINT32(data, 8); /* bitmapDataLength (4 bytes) */
+	bitmapData = data + 12; /* bitmapData */
 
-	for (i = 0; i < message->num_rects; i++)
+	/* decode bitmap data */
+	message = rfx_process_message((RFX_CONTEXT *) gdi->rfx_context, bitmapData, bitmapDataLength);
+
+	if (message->num_rects > 1) /* RDVH */
 	{
-		tx = message->rects[i].x + x;
-		ty = message->rects[i].y + y;
-		gdi_SetClipRgn(gdi->primary->hdc, tx, ty, message->rects[i].width, message->rects[i].height);
-	}
+		/* blit each tile */
+		for (i = 0; i < message->num_tiles; i++)
+		{
+			tx = message->tiles[i]->x + x;
+			ty = message->tiles[i]->y + y;
+			data = message->tiles[i]->data;
+
+			gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv);
+
+			for (j = 0; j < message->num_rects; j++)
+			{
+				gdi_SetClipRgn(gdi->primary->hdc,
+						message->rects[j].x, message->rects[j].y,
+						message->rects[j].width, message->rects[j].height);
+
+				gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY);
+			}
+		}
 
-	for (i = 0; i < message->num_tiles; i++)
+		for (i = 0; i < message->num_rects; i++)
+		{
+			gdi_InvalidateRegion(gdi->primary->hdc,
+					message->rects[i].x, message->rects[i].y,
+					message->rects[i].width, message->rects[i].height);
+		}
+	}
+	else /* RDSH */
 	{
-		tx = message->tiles[i]->x + x;
-		ty = message->tiles[i]->y + y;
-		data = message->tiles[i]->data;
+		/* blit each tile */
+		for (i = 0; i < message->num_tiles; i++)
+		{
+			tx = message->tiles[i]->x + x;
+			ty = message->tiles[i]->y + y;
+			data = message->tiles[i]->data;
 
-		gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv);
-		gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY);
+			gdi_image_convert(data, gdi->tile->bitmap->data, 64, 64, 32, 32, gdi->clrconv);
 
-		gdi_InvalidateRegion(gdi->primary->hdc, tx, ty, 64, 64);
+			gdi_BitBlt(gdi->primary->hdc, tx, ty, 64, 64, gdi->tile->hdc, 0, 0, GDI_SRCCOPY);
+
+			gdi_InvalidateRegion(gdi->primary->hdc, tx, ty, 64, 64);
+		}
 	}
 
 	rfx_message_free(gdi->rfx_context, message);
+
+	return bitmapDataLength + 12;
 }
 
-void gdi_decode_data(GDI *gdi, uint8 * data, int data_size)
+int gdi_decode_surface_bits(GDI *gdi, uint8 * data, int size)
 {
-	int size;
-	int destLeft;
-	int destTop;
+	int length;
+	uint16 destLeft;
+	uint16 destTop;
+	uint16 destRight;
+	uint16 destBottom;
+
+	/* SURFCMD_STREAM_SURF_BITS */
+	/* cmdType (2 bytes) */
+	destLeft = GET_UINT16(data, 2); /* destLeft (2 bytes) */
+	destTop = GET_UINT16(data, 4); /* destTop (2 bytes) */
+	destRight = GET_UINT16(data, 6); /* destRight (2 bytes) */
+	destBottom = GET_UINT16(data, 8); /* destBottom (2 bytes) */
+
+	/* set clipping region */
+	gdi_SetClipRgn(gdi->primary->hdc, destLeft, destTop, destRight - destLeft, destBottom - destTop);
+
+	/* decode extended bitmap data */
+	length = gdi_decode_bitmap_data_ex(gdi, destLeft, destTop, data + 10, size - 10) + 10;
+
+	return length;
+}
+
+int gdi_decode_frame_marker(GDI *gdi, uint8 * data, int size)
+{
+	uint16 frameAction;
+	uint32 frameId;
+
+	frameAction = GET_UINT16(data, 0); /* frameAction */
+	frameId = GET_UINT32(data, 2); /* frameId */
+
+	switch (frameAction)
+	{
+		case SURFACECMD_FRAMEACTION_BEGIN:
+			break;
+
+		case SURFACECMD_FRAMEACTION_END:
+			break;
+
+		default:
+			break;
+	}
+
+	return 8;
+}
+
+void gdi_decode_data(GDI *gdi, uint8 * data, int size)
+{
+	int cmdLength;
 	uint16 cmdType;
-	uint32 length;
 
-	while (data_size > 0)
+	while (size > 0)
 	{
-		cmdType = GET_UINT16(data, 0);
+		cmdType = GET_UINT16(data, 0); /* cmdType */
 
 		switch (cmdType)
 		{
 			case CMDTYPE_SET_SURFACE_BITS:
 			case CMDTYPE_STREAM_SURFACE_BITS:
-				destLeft = GET_UINT16(data, 2);
-				destTop = GET_UINT16(data, 4);
-				length = GET_UINT32(data, 18);
-				gdi_decode_frame(gdi, destLeft, destTop, data + 22, length);
-				size = 22 + length;
+				cmdLength = gdi_decode_surface_bits(gdi, data, size);
 				break;
 
 			case CMDTYPE_FRAME_MARKER:
-				size = 8;
+				cmdLength = gdi_decode_frame_marker(gdi, data, size);
 				break;
 
 			default:
-				size = 2;
+				cmdLength = 2;
 				break;
 		}
-		data_size -= size;
-		data += size;
+
+		size -= cmdLength;
+		data += cmdLength;
 	}
 }
diff --git a/libfreerdp-gdi/decode.h b/libfreerdp-gdi/decode.h
index ccadb29..408241a 100644
--- a/libfreerdp-gdi/decode.h
+++ b/libfreerdp-gdi/decode.h
@@ -25,7 +25,7 @@
 
 #include "gdi.h"
 
-void gdi_decode_frame(GDI *gdi, int x, int y, uint8 * data, uint32 length);
-void gdi_decode_data(GDI *gdi, uint8 * data, int data_size);
+void gdi_decode_bitmap_data(GDI *gdi, int x, int y, uint8 * data, uint32 length);
+void gdi_decode_data(GDI *gdi, uint8 * data, int size);
 
 #endif /* __DECODE_H */
diff --git a/libfreerdp-gdi/gdi.c b/libfreerdp-gdi/gdi.c
index 8e86736..ab25825 100644
--- a/libfreerdp-gdi/gdi.c
+++ b/libfreerdp-gdi/gdi.c
@@ -23,19 +23,7 @@
 #include <freerdp/rfx.h>
 #include <freerdp/freerdp.h>
 
-#include "color.h"
-#include "decode.h"
-
-#include "gdi_dc.h"
-#include "gdi_pen.h"
-#include "gdi_line.h"
-#include "gdi_shape.h"
-#include "gdi_brush.h"
-#include "gdi_region.h"
-#include "gdi_bitmap.h"
-#include "gdi_palette.h"
-#include "gdi_drawing.h"
-#include "gdi_clipping.h"
+#include "libgdi.h"
 
 #include "gdi.h"
 
@@ -1188,6 +1176,11 @@ gdi_init(rdpInst * inst, uint32 flags)
 
 	gdi_register_callbacks(inst);
 
+	gdi->BitBlt = gdi_BitBlt;
+	gdi->gdi_image_convert = gdi_image_convert;
+
+	GDI_INIT_SIMD(gdi);
+
 	return 0;
 }
 
@@ -1197,6 +1190,8 @@ void gdi_free(rdpInst* inst)
 
 	if (gdi)
 	{
+		gdi_bitmap_free(gdi->tile);
+		rfx_context_free(gdi->rfx_context);
 		gdi_bitmap_free(gdi->primary);
 		gdi_DeleteObject((HGDIOBJECT) gdi->hdc);
 		free(gdi->clrconv);
diff --git a/libfreerdp-gdi/gdi.h b/libfreerdp-gdi/gdi.h
index 9398d36..38f2194 100644
--- a/libfreerdp-gdi/gdi.h
+++ b/libfreerdp-gdi/gdi.h
@@ -229,6 +229,17 @@ struct _GDI_IMAGE
 typedef struct _GDI_IMAGE GDI_IMAGE;
 typedef GDI_IMAGE* HGDI_IMAGE;
 
+#include "gdi_dc.h"
+#include "gdi_pen.h"
+#include "gdi_line.h"
+#include "gdi_shape.h"
+#include "gdi_brush.h"
+#include "gdi_region.h"
+#include "gdi_bitmap.h"
+#include "gdi_palette.h"
+#include "gdi_drawing.h"
+#include "gdi_clipping.h"
+
 struct _GDI
 {
 	int width;
@@ -247,9 +258,15 @@ struct _GDI
 	GDI_COLOR textColor;
 	void * rfx_context;
 	GDI_IMAGE *tile;
+
+	/* callbacks */
+	p_gdi_BitBlt BitBlt;
+	p_gdi_image_convert gdi_image_convert;
 };
 typedef struct _GDI GDI;
 
+#include "decode.h"
+
 uint32 gdi_rop3_code(uint8 code);
 void gdi_copy_mem(uint8 *d, uint8 *s, int n);
 void gdi_copy_memb(uint8 *d, uint8 *s, int n);
diff --git a/libfreerdp-gdi/gdi_bitmap.c b/libfreerdp-gdi/gdi_bitmap.c
index 973a4ea..c5af510 100644
--- a/libfreerdp-gdi/gdi_bitmap.c
+++ b/libfreerdp-gdi/gdi_bitmap.c
@@ -31,7 +31,7 @@
 
 #include "gdi_bitmap.h"
 
-pBitBlt BitBlt_[5] =
+p_gdi_BitBlt_bpp BitBlt_[5] =
 {
 	NULL,
 	BitBlt_8bpp,
@@ -180,7 +180,7 @@ HGDI_BITMAP gdi_CreateCompatibleBitmap(HGDI_DC hdc, int nWidth, int nHeight)
 
 int gdi_BitBlt(HGDI_DC hdcDest, int nXDest, int nYDest, int nWidth, int nHeight, HGDI_DC hdcSrc, int nXSrc, int nYSrc, int rop)
 {
-	pBitBlt _BitBlt = BitBlt_[IBPP(hdcDest->bitsPerPixel)];
+	p_gdi_BitBlt_bpp _BitBlt = BitBlt_[IBPP(hdcDest->bitsPerPixel)];
 
 	if (_BitBlt != NULL)
 		return _BitBlt(hdcDest, nXDest, nYDest, nWidth, nHeight, hdcSrc, nXSrc, nYSrc, rop);
diff --git a/libfreerdp-gdi/gdi_bitmap.h b/libfreerdp-gdi/gdi_bitmap.h
index f6943a0..e4f7604 100644
--- a/libfreerdp-gdi/gdi_bitmap.h
+++ b/libfreerdp-gdi/gdi_bitmap.h
@@ -37,6 +37,7 @@ HGDI_BITMAP gdi_CreateBitmap(int nWidth, int nHeight, int cBitsPerPixel, uint8*
 HGDI_BITMAP gdi_CreateCompatibleBitmap(HGDI_DC hdc, int nWidth, int nHeight);
 int gdi_BitBlt(HGDI_DC hdcDest, int nXDest, int nYDest, int nWidth, int nHeight, HGDI_DC hdcSrc, int nXSrc, int nYSrc, int rop);
 
-typedef int (*pBitBlt)(HGDI_DC hdcDest, int nXDest, int nYDest, int nWidth, int nHeight, HGDI_DC hdcSrc, int nXSrc, int nYSrc, int rop);
+typedef int (*p_gdi_BitBlt_bpp)(HGDI_DC hdcDest, int nXDest, int nYDest, int nWidth, int nHeight, HGDI_DC hdcSrc, int nXSrc, int nYSrc, int rop);
+typedef int (*p_gdi_BitBlt)(HGDI_DC hdcDest, int nXDest, int nYDest, int nWidth, int nHeight, HGDI_DC hdcSrc, int nXSrc, int nYSrc, int rop);
 
 #endif /* __GDI_BITMAP_H */
diff --git a/libfreerdp-gdi/libgdi.h b/libfreerdp-gdi/libgdi.h
new file mode 100644
index 0000000..22f48f1
--- /dev/null
+++ b/libfreerdp-gdi/libgdi.h
@@ -0,0 +1,35 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   GDI Library
+
+   Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com>
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef __LIBGDI_H
+#define __LIBGDI_H
+
+#ifdef WITH_SSE
+#include "gdi_sse.h"
+#endif
+
+#ifdef WITH_NEON
+#include "gdi_neon.h"
+#endif
+
+#ifndef GDI_INIT_SIMD
+#define GDI_INIT_SIMD(_gdi) do { } while (0)
+#endif
+
+#endif /* __LIBGDI_H */
diff --git a/libfreerdp-gdi/neon/Makefile.am b/libfreerdp-gdi/neon/Makefile.am
new file mode 100644
index 0000000..27aee48
--- /dev/null
+++ b/libfreerdp-gdi/neon/Makefile.am
@@ -0,0 +1,25 @@
+## Process this file with automake to produce Makefile.in
+
+# libfreerdp-gdi-neon
+noinst_LTLIBRARIES = libfreerdp-gdi-neon.la
+
+libfreerdp_gdi_neon_la_SOURCES =
+
+if WITH_NEON
+libfreerdp_gdi_neon_la_SOURCES += \
+	gdi_neon.c gdi_neon.h
+endif
+
+libfreerdp_gdi_neon_la_CFLAGS = \
+	-I$(top_srcdir) \
+	-I$(top_srcdir)/include \
+	-I..
+
+libfreerdp_gdi_neon_la_LDFLAGS =
+
+libfreerdp_gdi_neon_la_LIBDADD =
+
+# extra
+EXTRA_DIST =
+
+DISTCLEANFILES = 
diff --git a/libfreerdp-gdi/neon/gdi_neon.c b/libfreerdp-gdi/neon/gdi_neon.c
new file mode 100644
index 0000000..a0a92b7
--- /dev/null
+++ b/libfreerdp-gdi/neon/gdi_neon.c
@@ -0,0 +1,32 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   GDI NEON Optimizations
+
+   Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com>
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <freerdp/freerdp.h>
+#include "gdi.h"
+
+#include "gdi_neon.h"
+
+void gdi_init_neon(GDI* gdi)
+{
+
+}
diff --git a/libfreerdp-gdi/neon/gdi_neon.h b/libfreerdp-gdi/neon/gdi_neon.h
new file mode 100644
index 0000000..ef94d46
--- /dev/null
+++ b/libfreerdp-gdi/neon/gdi_neon.h
@@ -0,0 +1,31 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   GDI NEON Optimizations
+
+   Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com>
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef __GDI_NEON_H
+#define __GDI_NEON_H
+
+#include "gdi.h"
+
+void gdi_init_neon(GDI* gdi);
+
+#ifndef GDI_INIT_SIMD
+#define GDI_INIT_SIMD(_gdi) gdi_init_neon(_gdi)
+#endif
+
+#endif /* __GDI_NEON_H */
diff --git a/libfreerdp-gdi/sse/Makefile.am b/libfreerdp-gdi/sse/Makefile.am
new file mode 100644
index 0000000..23f6e27
--- /dev/null
+++ b/libfreerdp-gdi/sse/Makefile.am
@@ -0,0 +1,25 @@
+## Process this file with automake to produce Makefile.in
+
+# libfreerdp-gdi-sse
+noinst_LTLIBRARIES = libfreerdp-gdi-sse.la
+
+libfreerdp_gdi_sse_la_SOURCES =
+
+if WITH_SSE
+libfreerdp_gdi_sse_la_SOURCES += \
+	gdi_sse.c gdi_sse.h
+endif
+
+libfreerdp_gdi_sse_la_CFLAGS = \
+	-I$(top_srcdir) \
+	-I$(top_srcdir)/include \
+	-I..
+
+libfreerdp_gdi_sse_la_LDFLAGS =
+
+libfreerdp_gdi_sse_la_LIBDADD =
+
+# extra
+EXTRA_DIST =
+
+DISTCLEANFILES = 
diff --git a/libfreerdp-gdi/sse/gdi_sse.c b/libfreerdp-gdi/sse/gdi_sse.c
new file mode 100644
index 0000000..fb0cc8a
--- /dev/null
+++ b/libfreerdp-gdi/sse/gdi_sse.c
@@ -0,0 +1,32 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   GDI SSE Optimizations
+
+   Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com>
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <freerdp/freerdp.h>
+#include "gdi.h"
+
+#include "gdi_sse.h"
+
+void gdi_init_sse(GDI* gdi)
+{
+
+}
diff --git a/libfreerdp-gdi/sse/gdi_sse.h b/libfreerdp-gdi/sse/gdi_sse.h
new file mode 100644
index 0000000..e326503
--- /dev/null
+++ b/libfreerdp-gdi/sse/gdi_sse.h
@@ -0,0 +1,31 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   GDI SSE Optimizations
+
+   Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com>
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef __GDI_SSE_H
+#define __GDI_SSE_H
+
+#include "gdi.h"
+
+void gdi_init_sse(GDI* gdi);
+
+#ifndef GDI_INIT_SIMD
+#define GDI_INIT_SIMD(_gdi) gdi_init_sse(_gdi)
+#endif
+
+#endif /* __GDI_SSE_H */
diff --git a/libfreerdp-rfx/Makefile.am b/libfreerdp-rfx/Makefile.am
index bc60460..908dd06 100644
--- a/libfreerdp-rfx/Makefile.am
+++ b/libfreerdp-rfx/Makefile.am
@@ -12,6 +12,7 @@ libfreerdp_rfx_la_SOURCES = \
 	rfx_quantization.c rfx_quantization.h \
 	rfx_dwt.c rfx_dwt.h \
 	rfx_decode.c rfx_decode.h \
+	rfx_encode.c rfx_encode.h \
 	rfx_pool.c rfx_pool.h \
 	librfx.c librfx.h
 
@@ -24,10 +25,17 @@ libfreerdp_rfx_la_LDFLAGS =
 libfreerdp_rfx_la_LIBADD =
 
 if WITH_SSE
+SUBDIRS = sse
 libfreerdp_rfx_la_CFLAGS += -I$(top_srcdir)/libfreerdp-rfx/sse
 libfreerdp_rfx_la_LIBADD += sse/libfreerdp-rfx-sse.la
 endif
 
+if WITH_NEON
+SUBDIRS = neon
+libfreerdp_rfx_la_CFLAGS += -I$(top_srcdir)/libfreerdp-rfx/neon
+libfreerdp_rfx_la_LIBADD += neon/libfreerdp-rfx-neon.la
+endif
+
 # extra
 EXTRA_DIST =
 
diff --git a/libfreerdp-rfx/librfx.c b/libfreerdp-rfx/librfx.c
index 6bc9f66..5d9b5a6 100644
--- a/libfreerdp-rfx/librfx.c
+++ b/libfreerdp-rfx/librfx.c
@@ -27,9 +27,96 @@
 
 #include "rfx_pool.h"
 #include "rfx_decode.h"
+#include "rfx_encode.h"
+#include "rfx_quantization.h"
+#include "rfx_dwt.h"
 
 #include "librfx.h"
 
+/*
+   The quantization values control the compression rate and quality. The value
+   range is between 6 and 15. The higher value, the higher compression rate
+   and lower quality.
+
+   This is the default values being use by the MS RDP server, and we will also
+   use it as our default values for the encoder. It can be overrided by setting
+   the context->num_quants and context->quants member.
+
+   The order of the values are:
+   LL3, LH3, HL3, HH3, LH2, HL2, HH2, LH1, HL1, HH1
+*/
+static const uint32 rfx_default_quantization_values[] =
+{
+	6, 6, 6, 6, 7, 7, 8, 8, 8, 9
+};
+
+void rfx_profiler_create(RFX_CONTEXT * context)
+{
+	PROFILER_CREATE(context->prof_rfx_decode_rgb, "rfx_decode_rgb");
+	PROFILER_CREATE(context->prof_rfx_decode_component, "rfx_decode_component");
+	PROFILER_CREATE(context->prof_rfx_rlgr_decode, "rfx_rlgr_decode");
+	PROFILER_CREATE(context->prof_rfx_differential_decode, "rfx_differential_decode");
+	PROFILER_CREATE(context->prof_rfx_quantization_decode, "rfx_quantization_decode");
+	PROFILER_CREATE(context->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode");
+	PROFILER_CREATE(context->prof_rfx_decode_YCbCr_to_RGB, "rfx_decode_YCbCr_to_RGB");
+	PROFILER_CREATE(context->prof_rfx_decode_format_RGB, "rfx_decode_format_RGB");
+
+	PROFILER_CREATE(context->prof_rfx_encode_rgb, "rfx_encode_rgb");
+	PROFILER_CREATE(context->prof_rfx_encode_component, "rfx_encode_component");
+	PROFILER_CREATE(context->prof_rfx_rlgr_encode, "rfx_rlgr_encode");
+	PROFILER_CREATE(context->prof_rfx_differential_encode, "rfx_differential_encode");
+	PROFILER_CREATE(context->prof_rfx_quantization_encode, "rfx_quantization_encode");
+	PROFILER_CREATE(context->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode");
+	PROFILER_CREATE(context->prof_rfx_encode_RGB_to_YCbCr, "rfx_encode_RGB_to_YCbCr");
+	PROFILER_CREATE(context->prof_rfx_encode_format_RGB, "rfx_encode_format_RGB");
+}
+
+void rfx_profiler_free(RFX_CONTEXT * context)
+{
+	PROFILER_FREE(context->prof_rfx_decode_rgb);
+	PROFILER_FREE(context->prof_rfx_decode_component);
+	PROFILER_FREE(context->prof_rfx_rlgr_decode);
+	PROFILER_FREE(context->prof_rfx_differential_decode);
+	PROFILER_FREE(context->prof_rfx_quantization_decode);
+	PROFILER_FREE(context->prof_rfx_dwt_2d_decode);
+	PROFILER_FREE(context->prof_rfx_decode_YCbCr_to_RGB);
+	PROFILER_FREE(context->prof_rfx_decode_format_RGB);
+
+	PROFILER_FREE(context->prof_rfx_encode_rgb);
+	PROFILER_FREE(context->prof_rfx_encode_component);
+	PROFILER_FREE(context->prof_rfx_rlgr_encode);
+	PROFILER_FREE(context->prof_rfx_differential_encode);
+	PROFILER_FREE(context->prof_rfx_quantization_encode);
+	PROFILER_FREE(context->prof_rfx_dwt_2d_encode);
+	PROFILER_FREE(context->prof_rfx_encode_RGB_to_YCbCr);
+	PROFILER_FREE(context->prof_rfx_encode_format_RGB);
+}
+
+void rfx_profiler_print(RFX_CONTEXT * context)
+{
+	PROFILER_PRINT_HEADER;
+
+	PROFILER_PRINT(context->prof_rfx_decode_rgb);
+	PROFILER_PRINT(context->prof_rfx_decode_component);
+	PROFILER_PRINT(context->prof_rfx_rlgr_decode);
+	PROFILER_PRINT(context->prof_rfx_differential_decode);
+	PROFILER_PRINT(context->prof_rfx_quantization_decode);
+	PROFILER_PRINT(context->prof_rfx_dwt_2d_decode);
+	PROFILER_PRINT(context->prof_rfx_decode_YCbCr_to_RGB);
+	PROFILER_PRINT(context->prof_rfx_decode_format_RGB);
+
+	PROFILER_PRINT(context->prof_rfx_encode_rgb);
+	PROFILER_PRINT(context->prof_rfx_encode_component);
+	PROFILER_PRINT(context->prof_rfx_rlgr_encode);
+	PROFILER_PRINT(context->prof_rfx_differential_encode);
+	PROFILER_PRINT(context->prof_rfx_quantization_encode);
+	PROFILER_PRINT(context->prof_rfx_dwt_2d_encode);
+	PROFILER_PRINT(context->prof_rfx_encode_RGB_to_YCbCr);
+	PROFILER_PRINT(context->prof_rfx_encode_format_RGB);
+
+	PROFILER_PRINT_FOOTER;
+}
+
 RFX_CONTEXT *
 rfx_context_new(void)
 {
@@ -40,17 +127,26 @@ rfx_context_new(void)
 
 	context->pool = rfx_pool_new();
 
+	/* initialize the default pixel format */
+	rfx_context_set_pixel_format(context, RFX_PIXEL_FORMAT_BGRA);
+
 	/* align buffers to 16 byte boundary (needed for SSE/SSE2 instructions) */
-	context->y_r_buffer = (uint32 *)(((uintptr_t)context->y_r_mem + 16) & ~ 0x0F);
-	context->cb_g_buffer = (uint32 *)(((uintptr_t)context->cb_g_mem + 16) & ~ 0x0F);
-	context->cr_b_buffer = (uint32 *)(((uintptr_t)context->cr_b_mem + 16) & ~ 0x0F);
+	context->y_r_buffer = (sint16 *)(((uintptr_t)context->y_r_mem + 16) & ~ 0x0F);
+	context->cb_g_buffer = (sint16 *)(((uintptr_t)context->cb_g_mem + 16) & ~ 0x0F);
+	context->cr_b_buffer = (sint16 *)(((uintptr_t)context->cr_b_mem + 16) & ~ 0x0F);
+
+	context->dwt_buffer = (sint16 *)(((uintptr_t)context->dwt_mem + 16) & ~ 0x0F);
 
-	context->idwt_buffers[1] = (uint32*) context->idwt_buffer_8;
-	context->idwt_buffers[2] = (uint32*) context->idwt_buffer_16;
-	context->idwt_buffers[4] = (uint32*) context->idwt_buffer_32;
+	/* create profilers for default decoding routines */
+	rfx_profiler_create(context);
 	
-	/* set up default decoding routines */
+	/* set up default routines */
 	context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB;
+	context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr;
+	context->quantization_decode = rfx_quantization_decode;	
+	context->quantization_encode = rfx_quantization_encode;	
+	context->dwt_2d_decode = rfx_dwt_2d_decode;
+	context->dwt_2d_encode = rfx_dwt_2d_encode;
 
 	/* detect and enable SIMD CPU acceleration */
 	RFX_INIT_SIMD(context);
@@ -66,6 +162,9 @@ rfx_context_free(RFX_CONTEXT * context)
 
 	rfx_pool_free(context->pool);
 
+	rfx_profiler_print(context);
+	rfx_profiler_free(context);
+
 	if (context != NULL)
 		free(context);
 }
@@ -74,14 +173,29 @@ void
 rfx_context_set_pixel_format(RFX_CONTEXT * context, RFX_PIXEL_FORMAT pixel_format)
 {
 	context->pixel_format = pixel_format;
+	switch (pixel_format)
+	{
+		case RFX_PIXEL_FORMAT_BGRA:
+		case RFX_PIXEL_FORMAT_RGBA:
+			context->bytes_per_pixel = 4;
+			break;
+		case RFX_PIXEL_FORMAT_BGR:
+		case RFX_PIXEL_FORMAT_RGB:
+			context->bytes_per_pixel = 3;
+			break;
+		default:
+			context->bytes_per_pixel = 0;
+			break;
+	}
 }
 
 static void
-rfx_process_message_sync(RFX_CONTEXT * context, uint8 * data, int data_size)
+rfx_process_message_sync(RFX_CONTEXT * context, uint8 * data, int size)
 {
 	uint32 magic;
 
-	magic = GET_UINT32(data, 0);
+	/* RFX_SYNC */
+	magic = GET_UINT32(data, 0); /* magic (4 bytes), 0xCACCACCA */
 
 	if (magic != WF_MAGIC)
 	{
@@ -89,7 +203,7 @@ rfx_process_message_sync(RFX_CONTEXT * context, uint8 * data, int data_size)
 		return;
 	}
 
-	context->version = GET_UINT16(data, 4);
+	context->version = GET_UINT16(data, 4); /* version (2 bytes), WF_VERSION_1_0 (0x0100) */
 
 	if (context->version != WF_VERSION_1_0)
 	{
@@ -101,70 +215,68 @@ rfx_process_message_sync(RFX_CONTEXT * context, uint8 * data, int data_size)
 }
 
 static void
-rfx_process_message_codec_versions(RFX_CONTEXT * context, uint8 * data, int data_size)
+rfx_process_message_codec_versions(RFX_CONTEXT * context, uint8 * data, int size)
 {
 	int numCodecs;
 
-	numCodecs = GET_UINT8(data, 0);
+	numCodecs = GET_UINT8(data, 0); /* numCodecs (1 byte), must be set to 0x01 */
 
-	if (numCodecs < 1)
+	if (numCodecs != 1)
 	{
-		DEBUG_RFX("no version.");
+		DEBUG_RFX("numCodecs: %d, expected:1", numCodecs);
 		return;
 	}
 
-	context->codec_id = GET_UINT8(data, 1);
-	context->codec_version = GET_UINT16(data, 2);
+	/* RFX_CODEC_VERSIONT */
+	context->codec_id = GET_UINT8(data, 1); /* codecId (1 byte) */
+	context->codec_version = GET_UINT16(data, 2); /* version (2 bytes) */
 
 	DEBUG_RFX("id %d version 0x%X.", context->codec_id, context->codec_version);
 }
 
 static void
-rfx_process_message_channels(RFX_CONTEXT * context, uint8 * data, int data_size)
+rfx_process_message_channels(RFX_CONTEXT * context, uint8 * data, int size)
 {
 	int channelId;
-	int numChannels;
+	uint8 numChannels;
 
-	numChannels = GET_UINT8(data, 0);
+	numChannels = GET_UINT8(data, 0); /* numChannels (1 byte), must bet set to 0x01 */
 
-	if (numChannels < 1)
+	if (numChannels != 1)
 	{
-		DEBUG_RFX("no channel.");
+		DEBUG_RFX("numChannels:%d, expected:1", numChannels);
 		return;
 	}
 
-	channelId = GET_UINT8(data, 1);
-	context->width = GET_UINT16(data, 2);
-	context->height = GET_UINT16(data, 4);
+	/* RFX_CHANNELT */
+	channelId = GET_UINT8(data, 1); /* channelId (1 byte) */
+	context->width = GET_UINT16(data, 2); /* width (2 bytes) */
+	context->height = GET_UINT16(data, 4); /* height (2 bytes) */
 
 	DEBUG_RFX("numChannels %d id %d, %dx%d.",
 		numChannels, channelId, context->width, context->height);
 }
 
 static void
-rfx_process_message_context(RFX_CONTEXT * context, uint8 * data, int data_size)
+rfx_process_message_context(RFX_CONTEXT * context, uint8 * data, int size)
 {
 	uint8 ctxId;
-	uint8 codecId;
-	uint8 channelId;
 	uint16 tileSize;
 	uint16 properties;
 
-	codecId = GET_UINT8(data, 0);
-	channelId = GET_UINT8(data, 1);
-	ctxId = GET_UINT8(data, 2);
-	tileSize = GET_UINT16(data, 3);
-	properties = GET_UINT16(data, 5);
+	ctxId = GET_UINT8(data, 0); /* ctxId (1 byte), must be set to 0x00 */
+	tileSize = GET_UINT16(data, 1); /* tileSize (2 bytes), must be set to CT_TILE_64x64 (0x0040) */
+	properties = GET_UINT16(data, 3); /* properties (2 bytes) */
 
-	DEBUG_RFX("codec %d channel %d ctx %d tileSize %d properties 0x%X.",
-		codecId, channelId, ctxId, tileSize, properties);
+	DEBUG_RFX("ctxId %d tileSize %d properties 0x%X.", ctxId, tileSize, properties);
 
+	context->properties = properties;
 	context->flags = (properties & 0x0007);
 
 	if (context->flags == CODEC_MODE)
-		DEBUG_RFX("codec in image mode.");
+		DEBUG_RFX("codec is in image mode.");
 	else
-		DEBUG_RFX("codec in video mode.");
+		DEBUG_RFX("codec is in video mode.");
 
 	switch ((properties & 0x1E00) >> 9)
 	{
@@ -185,11 +297,30 @@ rfx_process_message_context(RFX_CONTEXT * context, uint8 * data, int data_size)
 }
 
 static void
-rfx_process_message_region(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int data_size)
+rfx_process_message_frame_begin(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int size)
+{
+	uint32 frameIdx;
+	uint16 numRegions;
+
+	frameIdx = GET_UINT32(data, 0); /* frameIdx (4 bytes), if codec is in video mode, must be ignored */
+	numRegions = GET_UINT16(data, 4); /* numRegions (2 bytes) */
+
+	DEBUG_RFX("RFX_FRAME_BEGIN: frameIdx:%d numRegions:%d", frameIdx, numRegions);
+}
+
+static void
+rfx_process_message_frame_end(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int size)
+{
+	DEBUG_RFX("RFX_FRAME_END");
+}
+
+static void
+rfx_process_message_region(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int size)
 {
 	int i;
 
-	message->num_rects = GET_UINT16(data, 3);
+	/* regionFlags (1 byte) */
+	message->num_rects = GET_UINT16(data, 1); /* numRects (2 bytes) */
 
 	if (message->num_rects < 1)
 	{
@@ -202,26 +333,28 @@ rfx_process_message_region(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 *
 	else
 		message->rects = (RFX_RECT*) malloc(message->num_rects * sizeof(RFX_RECT));
 
-	data += 5;
-	data_size -= 5;
+	data += 3;
+	size -= 3;
 
-	for (i = 0; i < message->num_rects && data_size > 0; i++)
+	/* rects */
+	for (i = 0; i < message->num_rects && size > 0; i++)
 	{
-		message->rects[i].x = GET_UINT16(data, 0);
-		message->rects[i].y = GET_UINT16(data, 2);
-		message->rects[i].width = GET_UINT16(data, 4);
-		message->rects[i].height = GET_UINT16(data, 6);
+		/* RFX_RECT */
+		message->rects[i].x = GET_UINT16(data, 0); /* x (2 bytes) */
+		message->rects[i].y = GET_UINT16(data, 2); /* y (2 bytes) */
+		message->rects[i].width = GET_UINT16(data, 4); /* width (2 bytes) */
+		message->rects[i].height = GET_UINT16(data, 6); /* height (2 bytes) */
 
 		DEBUG_RFX("rect %d (%d %d %d %d).",
 			i, message->rects[i].x, message->rects[i].y, message->rects[i].width, message->rects[i].height);
 
 		data += 8;
-		data_size -= 8;
+		size -= 8;
 	}
 }
 
 static void
-rfx_process_message_tile(RFX_CONTEXT * context, RFX_TILE * tile, uint8 * data, int data_size)
+rfx_process_message_tile(RFX_CONTEXT * context, RFX_TILE * tile, uint8 * data, int size)
 {
 	uint8 quantIdxY;
 	uint8 quantIdxCb;
@@ -229,14 +362,15 @@ rfx_process_message_tile(RFX_CONTEXT * context, RFX_TILE * tile, uint8 * data, i
 	uint16 xIdx, yIdx;
 	uint16 YLen, CbLen, CrLen;
 
-	quantIdxY = GET_UINT8(data, 0);
-	quantIdxCb = GET_UINT8(data, 1);
-	quantIdxCr = GET_UINT8(data, 2);
-	xIdx = GET_UINT16(data, 3);
-	yIdx = GET_UINT16(data, 5);
-	YLen = GET_UINT16(data, 7);
-	CbLen = GET_UINT16(data, 9);
-	CrLen = GET_UINT16(data, 11);
+	/* RFX_TILE */
+	quantIdxY = GET_UINT8(data, 0); /* quantIdxY (1 byte) */
+	quantIdxCb = GET_UINT8(data, 1); /* quantIdxCb (1 byte) */
+	quantIdxCr = GET_UINT8(data, 2); /* quantIdxCr (1 byte) */
+	xIdx = GET_UINT16(data, 3); /* xIdx (2 bytes) */
+	yIdx = GET_UINT16(data, 5); /* yIdx (2 bytes) */
+	YLen = GET_UINT16(data, 7); /* YLen (2 bytes) */
+	CbLen = GET_UINT16(data, 9); /* CbLen (2 bytes) */
+	CrLen = GET_UINT16(data, 11); /* CrLen (2 bytes) */
 
 	DEBUG_RFX("quantIdxY:%d quantIdxCb:%d quantIdxCr:%d xIdx:%d yIdx:%d YLen:%d CbLen:%d CrLen:%d",
 		quantIdxY, quantIdxCb, quantIdxCr, xIdx, yIdx, YLen, CbLen, CrLen);
@@ -253,14 +387,27 @@ rfx_process_message_tile(RFX_CONTEXT * context, RFX_TILE * tile, uint8 * data, i
 }
 
 static void
-rfx_process_message_tileset(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int data_size)
+rfx_process_message_tileset(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8 * data, int size)
 {
 	int i;
+	uint16 subtype;
 	uint32 blockLen;
 	uint32 blockType;
-	uint32 tileDataSize;
+	uint32 tilesDataSize;
+
+	subtype = GET_UINT16(data, 0); /* subtype (2 bytes) must be set to CBT_TILESET (0xCAC2) */
+
+	if (subtype != CBT_TILESET)
+	{
+		DEBUG_RFX("invalid subtype, expected CBT_TILESET.");
+		return;
+	}
 
-	context->num_quants = GET_UINT8(data, 4);
+	/* idx (2 bytes), must be set to 0x0000 */
+	/* properties (2 bytes) */
+
+	context->num_quants = GET_UINT8(data, 6); /* numQuant (1 byte) */
+	/* tileSize (1 byte), must be set to 0x40 */
 
 	if (context->num_quants < 1)
 	{
@@ -268,7 +415,7 @@ rfx_process_message_tileset(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8
 		return;
 	}
 
-	message->num_tiles = GET_UINT16(data, 6);
+	message->num_tiles = GET_UINT16(data, 8); /* numTiles (2 bytes) */
 
 	if (message->num_tiles < 1)
 	{
@@ -276,18 +423,20 @@ rfx_process_message_tileset(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8
 		return;
 	}
 
-	tileDataSize = GET_UINT32(data, 8);
+	tilesDataSize = GET_UINT32(data, 10); /* tilesDataSize (4 bytes) */
 
-	data += 12;
-	data_size -= 12;
+	data += 14;
+	size -= 14;
 
 	if (context->quants != NULL)
 		context->quants = (uint32*) realloc((void*) context->quants, context->num_quants * 10 * sizeof(uint32));
 	else
 		context->quants = (uint32*) malloc(context->num_quants * 10 * sizeof(uint32));
 
-	for (i = 0; i < context->num_quants && data_size > 0; i++)
+	/* quantVals */
+	for (i = 0; i < context->num_quants && size > 0; i++)
 	{
+		/* RFX_CODEC_QUANT */
 		context->quants[i * 10] = (data[0] & 0x0F);
 		context->quants[i * 10 + 1] = (data[0] >> 4);
 		context->quants[i * 10 + 2] = (data[1] & 0x0F);
@@ -307,36 +456,35 @@ rfx_process_message_tileset(RFX_CONTEXT * context, RFX_MESSAGE * message, uint8
 			context->quants[i * 10 + 8], context->quants[i * 10 + 9]);
 
 		data += 5;
-		data_size -= 5;
+		size -= 5;
 	}
 
 	message->tiles = rfx_pool_get_tiles(context->pool, message->num_tiles);
 
-	for (i = 0; i < message->num_tiles && data_size > 0; i++)
+	/* tiles */
+	for (i = 0; i < message->num_tiles && size > 0; i++)
 	{
-		blockType = GET_UINT16(data, 0);
-		blockLen = GET_UINT32(data, 2);
+		/* RFX_TILE */
+		blockType = GET_UINT16(data, 0); /* blockType (2 bytes), must be set to CBT_TILE (0xCAC3) */
+		blockLen = GET_UINT32(data, 2); /* blockLen (4 bytes) */
 
-		switch (blockType)
+		if (blockType != CBT_TILE)
 		{
-			case CBT_TILE:
-				rfx_process_message_tile(context, message->tiles[i], data + 6, blockLen - 6);
-				break;
-
-			default:
-				DEBUG_RFX("unknown block type 0x%X", blockType);
-				break;
+			DEBUG_RFX("unknown block type 0x%X, expected CBT_TILE (0xCAC3).", blockType);
+			break;
 		}
 
-		data_size -= blockLen;
+		rfx_process_message_tile(context, message->tiles[i], data + 6, blockLen - 6);
+
+		size -= blockLen;
 		data += blockLen;
 	}
 }
 
 RFX_MESSAGE *
-rfx_process_message(RFX_CONTEXT * context, uint8 * data, int data_size)
+rfx_process_message(RFX_CONTEXT * context, uint8 * data, int size)
 {
-	uint32 subtype;
+	uint32 offset;
 	uint32 blockLen;
 	uint32 blockType;
 	RFX_MESSAGE * message;
@@ -344,50 +492,55 @@ rfx_process_message(RFX_CONTEXT * context, uint8 * data, int data_size)
 	message = (RFX_MESSAGE *) malloc(sizeof(RFX_MESSAGE));
 	memset(message, 0, sizeof(RFX_MESSAGE));
 
-	while (data_size > 0)
+	while (size > 0)
 	{
-		blockType = GET_UINT16(data, 0);
-		blockLen = GET_UINT32(data, 2);
+		/* RFX_BLOCKT */
+		blockType = GET_UINT16(data, 0); /* blockType (2 bytes) */
+		blockLen = GET_UINT32(data, 2); /* blockLen (4 bytes) */
+		offset = 6;
+
 		DEBUG_RFX("blockType 0x%X blockLen %d", blockType, blockLen);
 
+		if (blockType >= WBT_CONTEXT && blockType <= WBT_EXTENSION)
+		{
+			/* RFX_CODEC_CHANNELT */
+			/* codecId (1 byte) must be set to 0x01 */
+			/* channelId (1 byte) must be set to 0x00 */
+			offset = 8;
+		}
+
 		switch (blockType)
 		{
 			case WBT_SYNC:
-				rfx_process_message_sync(context, data + 6, blockLen - 6);
+				rfx_process_message_sync(context, data + offset, blockLen - offset);
 				break;
 
 			case WBT_CODEC_VERSIONS:
-				rfx_process_message_codec_versions(context, data + 6, blockLen - 6);
+				rfx_process_message_codec_versions(context, data + offset, blockLen - offset);
 				break;
 
 			case WBT_CHANNELS:
-				rfx_process_message_channels(context, data + 6, blockLen - 6);
+				rfx_process_message_channels(context, data + offset, blockLen - offset);
 				break;
 
 			case WBT_CONTEXT:
-				rfx_process_message_context(context, data + 6, blockLen - 6);
+				rfx_process_message_context(context, data + offset, blockLen - offset);
 				break;
 
 			case WBT_FRAME_BEGIN:
+				rfx_process_message_frame_begin(context, message, data + offset, blockLen - offset);
+				break;
+
 			case WBT_FRAME_END:
-				/* Can be ignored. */
+				rfx_process_message_frame_end(context, message, data + offset, blockLen - offset);
 				break;
 
 			case WBT_REGION:
-				rfx_process_message_region(context, message, data + 6, blockLen - 6);
+				rfx_process_message_region(context, message, data + offset, blockLen - offset);
 				break;
 
 			case WBT_EXTENSION:
-				subtype = GET_UINT16(data, 8);
-				switch (subtype)
-				{
-					case CBT_TILESET:
-						rfx_process_message_tileset(context, message, data + 10, blockLen - 10);
-						break;
-					default:
-						DEBUG_RFX("unknown subtype 0x%X", subtype);
-						break;
-				}
+				rfx_process_message_tileset(context, message, data + offset, blockLen - offset);
 				break;
 
 			default:
@@ -395,7 +548,7 @@ rfx_process_message(RFX_CONTEXT * context, uint8 * data, int data_size)
 				break;
 		}
 
-		data_size -= blockLen;
+		size -= blockLen;
 		data += blockLen;
 	}
 
@@ -419,3 +572,320 @@ rfx_message_free(RFX_CONTEXT * context, RFX_MESSAGE * message)
 		free(message);
 	}
 }
+
+static int
+rfx_compose_message_sync(RFX_CONTEXT * context, uint8 * buffer, int buffer_size)
+{
+	if (buffer_size < 12)
+	{
+		printf("rfx_compose_message_sync: buffer size too small.\n");
+		return 0;
+	}
+
+	SET_UINT16(buffer, 0, WBT_SYNC); /* BlockT.blockType */
+	SET_UINT32(buffer, 2, 12); /* BlockT.blockLen */
+	SET_UINT32(buffer, 6, WF_MAGIC); /* magic */
+	SET_UINT16(buffer, 10, WF_VERSION_1_0); /* version */
+
+	return 12;
+}
+
+static int
+rfx_compose_message_codec_versions(RFX_CONTEXT * context, uint8 * buffer, int buffer_size)
+{
+	if (buffer_size < 10)
+	{
+		printf("rfx_compose_message_codec_versions: buffer size too small.\n");
+		return 0;
+	}
+
+	SET_UINT16(buffer, 0, WBT_CODEC_VERSIONS); /* BlockT.blockType */
+	SET_UINT32(buffer, 2, 10); /* BlockT.blockLen */
+	SET_UINT8(buffer, 6, 1); /* numCodecs */
+	SET_UINT8(buffer, 7, 1); /* codecs.codecId */
+	SET_UINT16(buffer, 8, WF_VERSION_1_0); /* codecs.version */
+
+	return 10;
+}
+
+static int
+rfx_compose_message_channels(RFX_CONTEXT * context, uint8 * buffer, int buffer_size)
+{
+	if (buffer_size < 12)
+	{
+		printf("rfx_compose_message_channels: buffer size too small.\n");
+		return 0;
+	}
+
+	SET_UINT16(buffer, 0, WBT_CHANNELS); /* BlockT.blockType */
+	SET_UINT32(buffer, 2, 12); /* BlockT.blockLen */
+	SET_UINT8(buffer, 6, 1); /* numChannels */
+	SET_UINT8(buffer, 7, 0); /* Channel.channelId */
+	SET_UINT16(buffer, 8, context->width); /* Channel.width */
+	SET_UINT16(buffer, 10, context->height); /* Channel.height */
+
+	return 12;
+}
+
+static int
+rfx_compose_message_context(RFX_CONTEXT * context, uint8 * buffer, int buffer_size)
+{
+	uint16 properties;
+
+	if (buffer_size < 13)
+	{
+		printf("rfx_compose_message_context: buffer size too small.\n");
+		return 0;
+	}
+
+	SET_UINT16(buffer, 0, WBT_CONTEXT); /* CodecChannelT.blockType */
+	SET_UINT32(buffer, 2, 13); /* CodecChannelT.blockLen */
+	SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */
+	SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */
+	SET_UINT8(buffer, 8, 0); /* ctxId */
+	SET_UINT16(buffer, 9, CT_TILE_64x64); /* tileSize */
+
+	/* properties */
+	properties = context->flags; /* flags */
+	properties |= (COL_CONV_ICT << 3); /* cct */
+	properties |= (CLW_XFORM_DWT_53_A << 5); /* xft */
+	properties |= ((context->mode == RLGR1 ? CLW_ENTROPY_RLGR1 : CLW_ENTROPY_RLGR3) << 9); /* et */
+	properties |= (SCALAR_QUANTIZATION << 13); /* qt */
+	SET_UINT16(buffer, 11, properties);
+	context->properties = properties;
+
+	return 13;
+}
+
+int
+rfx_compose_message_header(RFX_CONTEXT * context, uint8 * buffer, int buffer_size)
+{
+	int composed_size;
+
+	composed_size = rfx_compose_message_sync(context, buffer, buffer_size);
+	composed_size += rfx_compose_message_codec_versions(context, buffer + composed_size, buffer_size - composed_size);
+	composed_size += rfx_compose_message_channels(context, buffer + composed_size, buffer_size - composed_size);
+	composed_size += rfx_compose_message_context(context, buffer + composed_size, buffer_size - composed_size);
+
+	return composed_size;
+}
+
+static int
+rfx_compose_message_frame_begin(RFX_CONTEXT * context, uint8 * buffer, int buffer_size)
+{
+	if (buffer_size < 14)
+	{
+		printf("rfx_compose_message_frame_begin: buffer size too small.\n");
+		return 0;
+	}
+
+	SET_UINT16(buffer, 0, WBT_FRAME_BEGIN); /* CodecChannelT.blockType */
+	SET_UINT32(buffer, 2, 14); /* CodecChannelT.blockLen */
+	SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */
+	SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */
+	SET_UINT32(buffer, 8, context->frame_idx); /* frameIdx */
+	SET_UINT16(buffer, 12, 1); /* numRegions */
+
+	return 14;
+}
+
+static int
+rfx_compose_message_region(RFX_CONTEXT * context, uint8 * buffer, int buffer_size,
+	const RFX_RECT * rects, int num_rects)
+{
+	int size;
+	int i;
+
+	if (buffer_size < 15 + num_rects * 8)
+	{
+		printf("rfx_compose_message_region: buffer size too small.\n");
+		return 0;
+	}
+
+	SET_UINT16(buffer, 0, WBT_REGION); /* CodecChannelT.blockType */
+	/* set CodecChannelT.blockLen later */
+	SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */
+	SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */
+	SET_UINT8(buffer, 8, 1); /* regionFlags */
+	SET_UINT16(buffer, 9, num_rects); /* numRects */
+	size = 11;
+
+	for (i = 0; i < num_rects; i++)
+	{
+		SET_UINT16(buffer, size, rects[i].x);
+		SET_UINT16(buffer, size + 2, rects[i].y);
+		SET_UINT16(buffer, size + 4, rects[i].width);
+		SET_UINT16(buffer, size + 6, rects[i].height);
+		size += 8;
+	}
+
+	SET_UINT16(buffer, size, CBT_REGION); /* regionType */
+	SET_UINT16(buffer, size + 2, 1); /* numTilesets */
+	size += 4;
+
+	SET_UINT32(buffer, 2, size); /* CodecChannelT.blockLen */
+	return size;
+}
+
+static int
+rfx_compose_message_tile(RFX_CONTEXT * context, uint8 * buffer, int buffer_size,
+	uint8 * tile_data, int tile_width, int tile_height, int rowstride,
+	const uint32 * quantVals, int quantIdxY, int quantIdxCb, int quantIdxCr, int xIdx, int yIdx)
+{
+	int YLen = 0;
+	int CbLen = 0;
+	int CrLen = 0;
+	int size;
+
+	if (buffer_size < 19)
+	{
+		printf("rfx_compose_message_tile: buffer size too small.\n");
+		return 0;
+	}
+
+	SET_UINT16(buffer, 0, CBT_TILE); /* BlockT.blockType */
+	/* set BlockT.blockLen later */
+	SET_UINT8(buffer, 6, quantIdxY); /* quantIdxY */
+	SET_UINT8(buffer, 7, quantIdxCb); /* quantIdxCb */
+	SET_UINT8(buffer, 8, quantIdxCr); /* quantIdxCr */
+	SET_UINT16(buffer, 9, xIdx); /* xIdx */
+	SET_UINT16(buffer, 11, yIdx); /* yIdx */
+
+	rfx_encode_rgb(context, tile_data, tile_width, tile_height, rowstride,
+		quantVals + quantIdxY * 10, quantVals + quantIdxCb * 10, quantVals + quantIdxCr * 10,
+		buffer + 19, buffer_size - 19, &YLen, &CbLen, &CrLen);
+
+	DEBUG_RFX("xIdx=%d yIdx=%d width=%d height=%d YLen=%d CbLen=%d CrLen=%d",
+		xIdx, yIdx, tile_width, tile_height, YLen, CbLen, CrLen);
+
+	SET_UINT16(buffer, 13, YLen); /* YLen */
+	SET_UINT16(buffer, 15, CbLen); /* CbLen */
+	SET_UINT16(buffer, 17, CrLen); /* CrLen */
+	size = 19 + YLen + CbLen + CrLen;
+	SET_UINT32(buffer, 2, size); /* BlockT.blockLen */
+
+	return size;
+}
+
+static int
+rfx_compose_message_tileset(RFX_CONTEXT * context, uint8 * buffer, int buffer_size,
+	uint8 * image_data, int width, int height, int rowstride)
+{
+	int size;
+	int i;
+	int numQuants;
+	const uint32 * quantVals;
+	const uint32 * quantValsPtr;
+	int quantIdxY;
+	int quantIdxCb;
+	int quantIdxCr;
+	int numTiles;
+	int numTilesX;
+	int numTilesY;
+	int xIdx;
+	int yIdx;
+	int tilesDataSize;
+
+	if (context->num_quants == 0)
+	{
+		numQuants = 1;
+		quantVals = rfx_default_quantization_values;
+		quantIdxY = 0;
+		quantIdxCb = 0;
+		quantIdxCr = 0;
+	}
+	else
+	{
+		numQuants = context->num_quants;
+		quantVals = context->quants;
+		quantIdxY = context->quant_idx_y;
+		quantIdxCb = context->quant_idx_cb;
+		quantIdxCr = context->quant_idx_cr;
+	}
+
+	numTilesX = (width + 63) / 64;
+	numTilesY = (height + 63) / 64;
+	numTiles = numTilesX * numTilesY;
+
+	if (buffer_size < 22 + numQuants * 5)
+	{
+		printf("rfx_compose_message_tileset: buffer size too small.\n");
+		return 0;
+	}
+
+	SET_UINT16(buffer, 0, WBT_EXTENSION); /* CodecChannelT.blockType */
+	/* set CodecChannelT.blockLen later */
+	SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */
+	SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */
+	SET_UINT16(buffer, 8, CBT_TILESET); /* subtype */
+	SET_UINT16(buffer, 10, 0); /* idx */
+	SET_UINT16(buffer, 12, context->properties); /* properties */
+	SET_UINT8(buffer, 14, numQuants); /* numQuants */
+	SET_UINT8(buffer, 15, 0x40); /* tileSize */
+	SET_UINT16(buffer, 16, numTiles); /* numTiles */
+	/* set tilesDataSize later */
+	size = 22;
+
+	quantValsPtr = quantVals;
+	for (i = 0; i < numQuants * 5; i++)
+	{
+		SET_UINT8(buffer, size, quantValsPtr[0] + (quantValsPtr[1] << 4));
+		quantValsPtr += 2;
+		size++;
+	}
+
+	DEBUG_RFX("width:%d height:%d rowstride:%d", width, height, rowstride);
+
+	tilesDataSize = 0;
+	for (yIdx = 0; yIdx < numTilesY; yIdx++)
+	{
+		for (xIdx = 0; xIdx < numTilesX; xIdx++)
+		{
+			tilesDataSize += rfx_compose_message_tile(context,
+				buffer + size + tilesDataSize, buffer_size - size - tilesDataSize,
+				image_data + yIdx * 64 * rowstride + xIdx * 64 * context->bytes_per_pixel,
+				xIdx < numTilesX - 1 ? 64 : width - xIdx * 64,
+				yIdx < numTilesY - 1 ? 64 : height - yIdx * 64,
+				rowstride, quantVals, quantIdxY, quantIdxCb, quantIdxCr, xIdx, yIdx);
+		}
+	}
+
+	size += tilesDataSize;
+	SET_UINT32(buffer, 2, size); /* CodecChannelT.blockLen */
+	SET_UINT32(buffer, 18, tilesDataSize); /* tilesDataSize */
+
+	return size;
+}
+
+static int
+rfx_compose_message_frame_end(RFX_CONTEXT * context, uint8 * buffer, int buffer_size)
+{
+	if (buffer_size < 8)
+	{
+		printf("rfx_compose_message_frame_end: buffer size too small.\n");
+		return 0;
+	}
+
+	SET_UINT16(buffer, 0, WBT_FRAME_END); /* CodecChannelT.blockType */
+	SET_UINT32(buffer, 2, 8); /* CodecChannelT.blockLen */
+	SET_UINT8(buffer, 6, 1); /* CodecChannelT.codecId */
+	SET_UINT8(buffer, 7, 0); /* CodecChannelT.channelId */
+
+	return 8;
+}
+
+int
+rfx_compose_message_data(RFX_CONTEXT * context, uint8 * buffer, int buffer_size,
+	const RFX_RECT * rects, int num_rects, uint8 * image_data, int width, int height, int rowstride)
+{
+	int composed_size;
+
+	composed_size = rfx_compose_message_frame_begin(context, buffer, buffer_size);
+	composed_size += rfx_compose_message_region(context, buffer + composed_size, buffer_size - composed_size,
+		rects, num_rects);
+	composed_size += rfx_compose_message_tileset(context, buffer + composed_size, buffer_size - composed_size,
+		image_data, width, height, rowstride);
+	composed_size += rfx_compose_message_frame_end(context, buffer + composed_size, buffer_size - composed_size);
+
+	return composed_size;
+}
diff --git a/libfreerdp-rfx/librfx.h b/libfreerdp-rfx/librfx.h
index 3c18474..87ea742 100644
--- a/libfreerdp-rfx/librfx.h
+++ b/libfreerdp-rfx/librfx.h
@@ -32,6 +32,10 @@
 #include "rfx_sse.h"
 #endif
 
+#ifdef WITH_NEON
+#include "rfx_neon.h"
+#endif
+
 #ifndef RFX_INIT_SIMD
 #define RFX_INIT_SIMD(_rfx_context) do { } while (0)
 #endif
diff --git a/libfreerdp-rfx/neon/Makefile.am b/libfreerdp-rfx/neon/Makefile.am
new file mode 100644
index 0000000..12d4921
--- /dev/null
+++ b/libfreerdp-rfx/neon/Makefile.am
@@ -0,0 +1,25 @@
+## Process this file with automake to produce Makefile.in
+
+# libfreerdp-rfx-neon
+noinst_LTLIBRARIES = libfreerdp-rfx-neon.la
+
+libfreerdp_rfx_neon_la_SOURCES =
+
+if WITH_NEON
+libfreerdp_rfx_neon_la_SOURCES += \
+	rfx_neon.c rfx_neon.h
+endif
+
+libfreerdp_rfx_neon_la_CFLAGS = \
+	-I$(top_srcdir) \
+	-I$(top_srcdir)/include \
+	-I..
+
+libfreerdp_rfx_neon_la_LDFLAGS =
+
+libfreerdp_rfx_neon_la_LIBDADD =
+
+# extra
+EXTRA_DIST =
+
+DISTCLEANFILES = 
diff --git a/libfreerdp-rfx/neon/rfx_neon.c b/libfreerdp-rfx/neon/rfx_neon.c
new file mode 100644
index 0000000..745f9ca
--- /dev/null
+++ b/libfreerdp-rfx/neon/rfx_neon.c
@@ -0,0 +1,372 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   RemoteFX Codec Library - NEON Optimizations
+
+   Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com>
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arm_neon.h>
+
+#include "rfx_neon.h"
+
+#if defined(ANDROID)
+#include <cpu-features.h>
+#endif
+
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+prefetch_data(void * buffer1)
+{
+	asm("  pld [%0, #64]		\t\n"
+		: // no output
+		: "r" (buffer1)
+		);
+}
+
+void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
+{
+	int16x8_t zero = vdupq_n_s16(0);
+	int16x8_t max = vdupq_n_s16(255);
+	int16x8_t y_add = vdupq_n_s16(128);
+
+	int16x8_t* y_r_buf = (int16x8_t*)y_r_buffer;
+	int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer;
+	int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer;
+
+	int i;
+	for (i = 0; i < 4096 / 8; i++)
+	{
+		prefetch_data(&y_r_buf[i]);
+		prefetch_data(&cr_b_buf[i]);
+		prefetch_data(&cb_g_buf[i]);
+
+		int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]);
+		y = vaddq_s16(y, y_add);
+
+		int16x8_t cr = vld1q_s16((sint16*)&cr_b_buf[i]);
+
+		// r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255);
+		int16x8_t r = vaddq_s16(y, cr);
+		r = vaddq_s16(r, vshrq_n_s16(cr, 2));
+		r = vaddq_s16(r, vshrq_n_s16(cr, 3));
+		r = vaddq_s16(r, vshrq_n_s16(cr, 5));
+		r = vminq_s16(vmaxq_s16(r, zero), max);
+		vst1q_s16((sint16*)&y_r_buf[i], r);
+
+		// cb = cb_g_buf[i];
+		int16x8_t cb = vld1q_s16((sint16*)&cb_g_buf[i]);
+
+		// g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
+		int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
+		g = vsubq_s16(g, vshrq_n_s16(cb, 4));
+		g = vsubq_s16(g, vshrq_n_s16(cb, 5));
+		g = vsubq_s16(g, vshrq_n_s16(cr, 1));
+		g = vsubq_s16(g, vshrq_n_s16(cr, 3));
+		g = vsubq_s16(g, vshrq_n_s16(cr, 4));
+		g = vsubq_s16(g, vshrq_n_s16(cr, 5));
+		g = vminq_s16(vmaxq_s16(g, zero), max);
+		vst1q_s16((sint16*)&cb_g_buf[i], g);
+
+		// b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255);
+		int16x8_t b = vaddq_s16(y, cb);
+		b = vaddq_s16(b, vshrq_n_s16(cb, 1));
+		b = vaddq_s16(b, vshrq_n_s16(cb, 2));
+		b = vaddq_s16(b, vshrq_n_s16(cb, 6));
+		b = vminq_s16(vmaxq_s16(b, zero), max);
+		vst1q_s16((sint16*)&cr_b_buf[i], b);
+	}
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_quantization_decode_block_NEON(sint16 * buffer, const int buffer_size, const uint32 factor)
+{
+	if (factor <= 6)
+		return;
+	int16x8_t quantFactors = vdupq_n_s16(factor - 6);
+	int16x8_t* buf = (int16x8_t*)buffer;
+	int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
+
+	do
+	{
+		prefetch_data(buf);
+		int16x8_t val = vld1q_s16((sint16*)buf);
+		val = vshlq_s16(val, quantFactors);
+		vst1q_s16((sint16*)buf, val);
+		buf++;
+	}
+	while(buf < buf_end);
+}
+
+void
+rfx_quantization_decode_NEON(sint16 * buffer, const uint32 * quantization_values)
+{
+	rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8]); /* HL1 */
+	rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
+	rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
+	rfx_quantization_decode_block_NEON(buffer + 3072, 256, quantization_values[5]); /* HL2 */
+	rfx_quantization_decode_block_NEON(buffer + 3328, 256, quantization_values[4]); /* LH2 */
+	rfx_quantization_decode_block_NEON(buffer + 3584, 256, quantization_values[6]); /* HH2 */
+	rfx_quantization_decode_block_NEON(buffer + 3840, 64, quantization_values[2]); /* HL3 */
+	rfx_quantization_decode_block_NEON(buffer + 3904, 64, quantization_values[1]); /* LH3 */
+	rfx_quantization_decode_block_NEON(buffer + 3868, 64, quantization_values[3]); /* HH3 */
+	rfx_quantization_decode_block_NEON(buffer + 4032, 64, quantization_values[0]); /* LL3 */
+}
+
+
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_decode_block_horiz_NEON(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
+{
+	int y, n;
+	sint16 * l_ptr = l;
+	sint16 * h_ptr = h;
+	sint16 * dst_ptr = dst;
+
+	for (y = 0; y < subband_width; y++)
+	{
+		/* Even coefficients */
+		for (n = 0; n < subband_width; n+=8)
+		{
+			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
+			int16x8_t l_n = vld1q_s16(l_ptr);
+			prefetch_data(l_ptr);
+
+			int16x8_t h_n = vld1q_s16(h_ptr);
+			int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
+			prefetch_data(h_ptr);
+
+			if (n == 0)
+			{
+				int16_t first = vgetq_lane_s16(h_n_m, 1);
+				h_n_m = vsetq_lane_s16(first, h_n_m, 0);
+			}
+
+			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
+			tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
+			tmp_n = vshrq_n_s16(tmp_n, 1);
+
+			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
+
+			vst1q_s16(l_ptr, dst_n);
+
+			l_ptr+=8;
+			h_ptr+=8;
+		}
+		l_ptr -= subband_width;
+		h_ptr -= subband_width;
+
+		/* Odd coefficients */
+		for (n = 0; n < subband_width; n+=8)
+		{
+			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
+
+			int16x8_t h_n = vld1q_s16(h_ptr);
+			prefetch_data(h_ptr);
+
+			h_n = vshlq_n_s16(h_n, 1);
+
+			int16x8x2_t dst_n;
+			dst_n.val[0] = vld1q_s16(l_ptr);
+			prefetch_data(l_ptr);
+			int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
+			if (n == subband_width - 8)
+			{
+				int16_t last = vgetq_lane_s16(dst_n_p, 6);
+				dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
+			}
+
+			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
+			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
+
+			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
+
+			vst2q_s16(dst_ptr, dst_n);
+			prefetch_data(dst_ptr);
+
+			l_ptr+=8;
+			h_ptr+=8;
+			dst_ptr+=16;
+		}
+	}
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_decode_block_vert_NEON(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
+{
+	int x, n;
+	sint16 * l_ptr = l;
+	sint16 * h_ptr = h;
+	sint16 * dst_ptr = dst;
+
+	int total_width = subband_width + subband_width;
+
+	/* Even coefficients */
+	for (n = 0; n < subband_width; n++)
+	{
+		for (x = 0; x < total_width; x+=8)
+		{
+			prefetch_data(l_ptr);
+			prefetch_data(h_ptr);
+			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
+
+			int16x8_t l_n = vld1q_s16(l_ptr);
+			int16x8_t h_n = vld1q_s16(h_ptr);
+
+			int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));;
+			if (n == 0)
+				tmp_n = vaddq_s16(tmp_n, h_n);
+			else
+			{
+				int16x8_t h_n_m = vld1q_s16((h_ptr - total_width));
+				tmp_n = vaddq_s16(tmp_n, h_n_m);
+			}
+			tmp_n = vshrq_n_s16(tmp_n, 1);
+
+			prefetch_data(dst_ptr);
+			int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
+			vst1q_s16(dst_ptr, dst_n);
+
+			l_ptr+=8;
+			h_ptr+=8;
+			dst_ptr+=8;
+		}
+		dst_ptr+=total_width;
+	}
+
+	h_ptr = h;
+	dst_ptr = dst + total_width;
+
+	/* Odd coefficients */
+	for (n = 0; n < subband_width; n++)
+	{
+		for (x = 0; x < total_width; x+=8)
+		{
+			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
+			int16x8_t h_n = vld1q_s16(h_ptr);
+			int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
+
+			prefetch_data(h_ptr);
+			prefetch_data(dst_ptr - total_width);
+
+			h_n = vshlq_n_s16(h_n, 1);
+
+			int16x8_t tmp_n = dst_n_m;
+			if (n == subband_width - 1)
+				tmp_n = vaddq_s16(tmp_n, dst_n_m);
+			else
+			{
+				int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width));
+				tmp_n = vaddq_s16(tmp_n, dst_n_p);
+			}
+			tmp_n = vshrq_n_s16(tmp_n, 1);
+
+			int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
+			vst1q_s16(dst_ptr, dst_n);
+			prefetch_data(dst_ptr);
+
+			h_ptr+=8;
+			dst_ptr+=8;
+		}
+		dst_ptr+=total_width;
+	}
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_decode_block_NEON(sint16 * buffer, sint16 * idwt, int subband_width)
+{
+	sint16 * hl, * lh, * hh, * ll;
+	sint16 * l_dst, * h_dst;
+
+	/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
+	/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
+	/* The lower part L uses LL(3) and HL(0). */
+	/* The higher part H uses LH(1) and HH(2). */
+
+	ll = buffer + subband_width * subband_width * 3;
+	hl = buffer;
+	l_dst = idwt;
+
+	rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);
+
+	lh = buffer + subband_width * subband_width;
+	hh = buffer + subband_width * subband_width * 2;
+	h_dst = idwt + subband_width * subband_width * 2;
+
+	rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);
+
+	/* Inverse DWT in vertical direction, results are stored in original buffer. */
+	rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
+}
+
+void
+rfx_dwt_2d_decode_NEON(sint16 * buffer, sint16 * dwt_buffer)
+{
+	rfx_dwt_2d_decode_block_NEON(buffer + 3840, dwt_buffer, 8);
+	rfx_dwt_2d_decode_block_NEON(buffer + 3072, dwt_buffer, 16);
+	rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
+}
+
+
+
+int isNeonSupported()
+{
+#if defined(ANDROID)
+	if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM)
+	{
+		DEBUG_RFX("NEON optimization disabled - No ARM CPU found");
+		return 0;
+	}
+
+	uint64_t features = android_getCpuFeatures();
+	if ((features & ANDROID_CPU_ARM_FEATURE_ARMv7))
+	{
+		if (features & ANDROID_CPU_ARM_FEATURE_NEON)
+		{
+			DEBUG_RFX("NEON optimization enabled!");
+			return 1;
+		}
+		DEBUG_RFX("NEON optimization disabled - CPU not NEON capable");
+	}
+	else
+		DEBUG_RFX("NEON optimization disabled - No ARMv7 CPU found");
+
+    return 0;
+#else
+    return 1; // TODO: set to 1 for development
+//    return 0;
+#endif
+}
+
+
+void rfx_init_neon(RFX_CONTEXT * context)
+{
+
+
+	if(isNeonSupported())
+	{
+		DEBUG_RFX("Using NEON optimizations");
+
+		IF_PROFILER(context->prof_rfx_decode_YCbCr_to_RGB->name = "rfx_decode_YCbCr_to_RGB_NEON");
+		IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_NEON");
+		IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_NEON");
+
+		context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_NEON;
+		context->quantization_decode = rfx_quantization_decode_NEON;
+		context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
+	}
+}
diff --git a/libfreerdp-rfx/neon/rfx_neon.h b/libfreerdp-rfx/neon/rfx_neon.h
new file mode 100644
index 0000000..64b702f
--- /dev/null
+++ b/libfreerdp-rfx/neon/rfx_neon.h
@@ -0,0 +1,32 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   RemoteFX Codec Library - NEON Optimizations
+
+   Copyright 2011 Marc-Andre Moreau <marcandre.moreau@gmail.com>
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef __RFX_NEON_H
+#define __RFX_NEON_H
+
+#include "librfx.h"
+#include <freerdp/rfx.h>
+
+void rfx_init_neon(RFX_CONTEXT * context);
+
+#ifndef RFX_INIT_SIMD
+#define RFX_INIT_SIMD(_rfx_context) rfx_init_neon(_rfx_context)
+#endif
+
+#endif /* __RFX_NEON_H */
diff --git a/libfreerdp-rfx/rfx_bitstream.c b/libfreerdp-rfx/rfx_bitstream.c
index ce741fc..a62ec17 100644
--- a/libfreerdp-rfx/rfx_bitstream.c
+++ b/libfreerdp-rfx/rfx_bitstream.c
@@ -43,11 +43,11 @@ rfx_bitstream_put_buffer(RFX_BITSTREAM * bs, uint8 * buffer, int nbytes)
 	bs->bits_left = 8;
 }
 
-uint32
+uint16
 rfx_bitstream_get_bits(RFX_BITSTREAM * bs, int nbits)
 {
 	int b;
-	uint32 n = 0;
+	uint16 n = 0;
 
 	while (bs->byte_pos < bs->nbytes && nbits > 0)
 	{
@@ -73,6 +73,30 @@ rfx_bitstream_get_bits(RFX_BITSTREAM * bs, int nbits)
 	return n;
 }
 
+void
+rfx_bitstream_put_bits(RFX_BITSTREAM * bs, uint16 bits, int nbits)
+{
+	int b;
+
+	while (bs->byte_pos < bs->nbytes && nbits > 0)
+	{
+		b = nbits;
+
+		if (b > bs->bits_left)
+			b = bs->bits_left;
+
+		bs->buffer[bs->byte_pos] |= ((bits >> (nbits - b)) & ((1 << b) - 1)) << (bs->bits_left - b);
+		bs->bits_left -= b;
+		nbits -= b;
+
+		if (bs->bits_left == 0)
+		{
+			bs->bits_left = 8;
+			bs->byte_pos++;
+		}
+	}
+}
+
 int
 rfx_bitstream_eos(RFX_BITSTREAM * bs)
 {
@@ -88,6 +112,12 @@ rfx_bitstream_left(RFX_BITSTREAM * bs)
 	return (bs->nbytes - bs->byte_pos - 1) * 8 + bs->bits_left;
 }
 
+int
+rfx_bitstream_get_processed_bytes(RFX_BITSTREAM * bs)
+{
+	return (bs->bits_left < 8 ? bs->byte_pos + 1 : bs->byte_pos);
+}
+
 void
 rfx_bitstream_free(RFX_BITSTREAM * bs)
 {
diff --git a/libfreerdp-rfx/rfx_bitstream.h b/libfreerdp-rfx/rfx_bitstream.h
index 93d12b7..191c191 100644
--- a/libfreerdp-rfx/rfx_bitstream.h
+++ b/libfreerdp-rfx/rfx_bitstream.h
@@ -35,12 +35,16 @@ RFX_BITSTREAM *
 rfx_bitstream_new(void);
 void
 rfx_bitstream_put_buffer(RFX_BITSTREAM * bs, uint8 * buffer, int nbytes);
-unsigned int
+uint16
 rfx_bitstream_get_bits(RFX_BITSTREAM * bs, int nbits);
+void
+rfx_bitstream_put_bits(RFX_BITSTREAM * bs, uint16 bits, int nbits);
 int
 rfx_bitstream_eos(RFX_BITSTREAM * bs);
 int
 rfx_bitstream_left(RFX_BITSTREAM * bs);
+int
+rfx_bitstream_get_processed_bytes(RFX_BITSTREAM * bs);
 void
 rfx_bitstream_free(RFX_BITSTREAM * bs);
 
diff --git a/libfreerdp-rfx/rfx_decode.c b/libfreerdp-rfx/rfx_decode.c
index 35dc306..8dc8853 100644
--- a/libfreerdp-rfx/rfx_decode.c
+++ b/libfreerdp-rfx/rfx_decode.c
@@ -27,13 +27,64 @@
 
 #include "rfx_decode.h"
 
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_decode_format_RGB(sint16 * r_buf, sint16 * g_buf, sint16 * b_buf,
+	RFX_PIXEL_FORMAT pixel_format, uint8 * dst_buf)
+{
+	sint16 * r = r_buf;
+	sint16 * g = g_buf;
+	sint16 * b = b_buf;
+	uint8 * dst = dst_buf;
+	int i;
+	
+	switch (pixel_format)
+	{
+		case RFX_PIXEL_FORMAT_BGRA:
+			for (i = 0; i < 4096; i++)
+			{
+				*dst++ = (uint8) (*b++);
+				*dst++ = (uint8) (*g++);
+				*dst++ = (uint8) (*r++);
+				*dst++ = 0xFF;
+			}
+			break;
+		case RFX_PIXEL_FORMAT_RGBA:
+			for (i = 0; i < 4096; i++)
+			{
+				*dst++ = (uint8) (*r++);
+				*dst++ = (uint8) (*g++);
+				*dst++ = (uint8) (*b++);
+				*dst++ = 0xFF;
+			}
+			break;
+		case RFX_PIXEL_FORMAT_BGR:
+			for (i = 0; i < 4096; i++)
+			{
+				*dst++ = (uint8) (*b++);
+				*dst++ = (uint8) (*g++);
+				*dst++ = (uint8) (*r++);
+			}
+			break;
+		case RFX_PIXEL_FORMAT_RGB:
+			for (i = 0; i < 4096; i++)
+			{
+				*dst++ = (uint8) (*r++);
+				*dst++ = (uint8) (*g++);
+				*dst++ = (uint8) (*b++);
+			}
+			break;
+		default:
+			break;
+	}
+}
+
 #define MINMAX(_v,_l,_h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v)))
 
 void
-rfx_decode_YCbCr_to_RGB(uint32 * y_r_buf, uint32 * cb_g_buf, uint32 * cr_b_buf)
+rfx_decode_YCbCr_to_RGB(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf)
 {
-	int y, cb, cr;
-	int r, g, b;
+	sint16 y, cb, cr;
+	sint16 r, g, b;
 
 	int i;
 	for (i = 0; i < 4096; i++)
@@ -51,29 +102,28 @@ rfx_decode_YCbCr_to_RGB(uint32 * y_r_buf, uint32 * cb_g_buf, uint32 * cr_b_buf)
 }
 
 static void
-rfx_decode_component(RFX_CONTEXT * context, const uint32 * quantization_values, int half,
-	const uint8 * data, int size, uint32 * buffer)
+rfx_decode_component(RFX_CONTEXT * context, const uint32 * quantization_values,
+	const uint8 * data, int size, sint16 * buffer)
 {
-	rfx_rlgr_decode(context->mode, data, size, buffer, 4096);
-
-	rfx_differential_decode(buffer + 4032, 64);
-
-	rfx_quantization_decode(buffer, 1024, quantization_values[8]); /* HL1 */
-	rfx_quantization_decode(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
-	rfx_quantization_decode(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
-	rfx_quantization_decode(buffer + 3072, 256, quantization_values[5]); /* HL2 */
-	rfx_quantization_decode(buffer + 3328, 256, quantization_values[4]); /* LH2 */
-	rfx_quantization_decode(buffer + 3584, 256, quantization_values[6]); /* HH2 */
-	rfx_quantization_decode(buffer + 3840, 64, quantization_values[2]); /* HL3 */
-	rfx_quantization_decode(buffer + 3904, 64, quantization_values[1]); /* LH3 */
-	rfx_quantization_decode(buffer + 3868, 64, quantization_values[3]); /* HH3 */
-	rfx_quantization_decode(buffer + 4032, 64, quantization_values[0]); /* LL3 */
-
-	rfx_dwt_2d_decode(context, (int*) buffer + 3840, 8);
-	rfx_dwt_2d_decode(context, (int*) buffer + 3072, 16);
-
-	if (!half)
-		rfx_dwt_2d_decode(context, (int*) buffer, 32);
+	PROFILER_ENTER(context->prof_rfx_decode_component);
+
+	PROFILER_ENTER(context->prof_rfx_rlgr_decode);
+		rfx_rlgr_decode(context->mode, data, size, buffer, 4096);
+	PROFILER_EXIT(context->prof_rfx_rlgr_decode);
+
+	PROFILER_ENTER(context->prof_rfx_differential_decode);
+		rfx_differential_decode(buffer + 4032, 64);
+	PROFILER_EXIT(context->prof_rfx_differential_decode);
+
+	PROFILER_ENTER(context->prof_rfx_quantization_decode);
+		context->quantization_decode(buffer, quantization_values);
+	PROFILER_EXIT(context->prof_rfx_quantization_decode);
+
+	PROFILER_ENTER(context->prof_rfx_dwt_2d_decode);
+		context->dwt_2d_decode(buffer, context->dwt_buffer);
+	PROFILER_EXIT(context->prof_rfx_dwt_2d_decode);
+
+	PROFILER_EXIT(context->prof_rfx_decode_component);
 }
 
 uint8*
@@ -82,49 +132,22 @@ rfx_decode_rgb(RFX_CONTEXT * context,
 	const uint8 * cb_data, int cb_size, const uint32 * cb_quants,
 	const uint8 * cr_data, int cr_size, const uint32 * cr_quants, uint8* rgb_buffer)
 {
-	int i;
-	uint8 * dst;
-	int r, g, b;
+	PROFILER_ENTER(context->prof_rfx_decode_rgb);
 
-	dst = rgb_buffer;
-	rfx_decode_component(context, y_quants, 0, y_data, y_size, context->y_r_buffer);
-	rfx_decode_component(context, cb_quants, 0, cb_data, cb_size, context->cb_g_buffer);
-	rfx_decode_component(context, cr_quants, 0, cr_data, cr_size, context->cr_b_buffer);
+	rfx_decode_component(context, y_quants, y_data, y_size, context->y_r_buffer); /* YData */
+	rfx_decode_component(context, cb_quants, cb_data, cb_size, context->cb_g_buffer); /* CbData */
+	rfx_decode_component(context, cr_quants, cr_data, cr_size, context->cr_b_buffer); /* CrData */
 
-	context->decode_YCbCr_to_RGB(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer);
+	PROFILER_ENTER(context->prof_rfx_decode_YCbCr_to_RGB);
+		context->decode_YCbCr_to_RGB(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer);
+	PROFILER_EXIT(context->prof_rfx_decode_YCbCr_to_RGB);
+
+	PROFILER_ENTER(context->prof_rfx_decode_format_RGB);
+		rfx_decode_format_RGB(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer,
+			context->pixel_format, rgb_buffer);
+	PROFILER_EXIT(context->prof_rfx_decode_format_RGB);
+	
+	PROFILER_EXIT(context->prof_rfx_decode_rgb);
 
-	for (i = 0; i < 4096; i++)
-	{
-		r = context->y_r_buffer[i];
-		g = context->cb_g_buffer[i];
-		b = context->cr_b_buffer[i];
-		switch (context->pixel_format)
-		{
-			case RFX_PIXEL_FORMAT_BGRA:
-				*dst++ = (uint8) (b);
-				*dst++ = (uint8) (g);
-				*dst++ = (uint8) (r);
-				*dst++ = 0xFF;
-				break;
-			case RFX_PIXEL_FORMAT_RGBA:
-				*dst++ = (uint8) (r);
-				*dst++ = (uint8) (g);
-				*dst++ = (uint8) (b);
-				*dst++ = 0xFF;
-				break;
-			case RFX_PIXEL_FORMAT_BGR:
-				*dst++ = (uint8) (b);
-				*dst++ = (uint8) (g);
-				*dst++ = (uint8) (r);
-				break;
-			case RFX_PIXEL_FORMAT_RGB:
-				*dst++ = (uint8) (r);
-				*dst++ = (uint8) (g);
-				*dst++ = (uint8) (b);
-				break;
-			default:
-				break;
-		}
-	}
 	return rgb_buffer;
 }
diff --git a/libfreerdp-rfx/rfx_decode.h b/libfreerdp-rfx/rfx_decode.h
index 356e232..3b7fd9b 100644
--- a/libfreerdp-rfx/rfx_decode.h
+++ b/libfreerdp-rfx/rfx_decode.h
@@ -23,7 +23,7 @@
 #include <freerdp/rfx.h>
 
 void
-rfx_decode_YCbCr_to_RGB(uint32 * y_r_buf, uint32 * cb_g_buf, uint32 * cr_b_buf);
+rfx_decode_YCbCr_to_RGB(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf);
 
 unsigned char *
 rfx_decode_rgb(RFX_CONTEXT * context,
diff --git a/libfreerdp-rfx/rfx_differential.c b/libfreerdp-rfx/rfx_differential.c
index 10b9dd6..d1b3eb1 100644
--- a/libfreerdp-rfx/rfx_differential.c
+++ b/libfreerdp-rfx/rfx_differential.c
@@ -23,10 +23,10 @@
 #include "rfx_differential.h"
 
 void
-rfx_differential_decode(uint32 * buffer, int buffer_size)
+rfx_differential_decode(sint16 * buffer, int buffer_size)
 {
-	uint32 * src;
-	uint32 * dst;
+	sint16 * src;
+	sint16 * dst;
 
 	for (src = buffer, dst = buffer + 1; buffer_size > 1; src++, dst++, buffer_size--)
 	{
@@ -34,3 +34,17 @@ rfx_differential_decode(uint32 * buffer, int buffer_size)
 	}
 }
 
+void
+rfx_differential_encode(sint16 * buffer, int buffer_size)
+{
+	sint16 n1, n2;
+	sint16 * dst;
+
+	for (n1 = *buffer, dst = buffer + 1; buffer_size > 1; dst++, buffer_size--)
+	{
+		n2 = *dst;
+		*dst -= n1;
+		n1 = n2;
+	}
+}
+
diff --git a/libfreerdp-rfx/rfx_differential.h b/libfreerdp-rfx/rfx_differential.h
index fc5deb5..141e8c9 100644
--- a/libfreerdp-rfx/rfx_differential.h
+++ b/libfreerdp-rfx/rfx_differential.h
@@ -23,7 +23,9 @@
 #include <freerdp/rfx.h>
 
 void
-rfx_differential_decode(uint32 * buffer, int buffer_size);
+rfx_differential_decode(sint16 * buffer, int buffer_size);
+void
+rfx_differential_encode(sint16 * buffer, int buffer_size);
 
 #endif
 
diff --git a/libfreerdp-rfx/rfx_dwt.c b/libfreerdp-rfx/rfx_dwt.c
index 3ed4e89..7f80975 100644
--- a/libfreerdp-rfx/rfx_dwt.c
+++ b/libfreerdp-rfx/rfx_dwt.c
@@ -24,32 +24,15 @@
 #include "rfx_dwt.h"
 
 void
-rfx_dwt_2d_decode(RFX_CONTEXT * context, int * buffer, int subband_width)
+rfx_dwt_2d_decode_block(sint16 * buffer, sint16 * idwt, int subband_width)
 {
-	int idwt_alloc;
-	int * idwt;
-	int * dst, * l, * h;
-	int * l_dst, * h_dst;
-	int * hl, * lh, * hh, * ll;
+	sint16 * dst, * l, * h;
+	sint16 * l_dst, * h_dst;
+	sint16 * hl, * lh, * hh, * ll;
 	int total_width;
 	int x, y;
 	int n;
 
-	switch (subband_width)
-	{
-		case 8:
-		case 16:
-		case 32:
-			idwt = (int*) context->idwt_buffers[subband_width >> 3];
-			idwt_alloc = 0;
-			break;
-
-		default:
-			idwt = (int*) malloc(subband_width * subband_width * 4 * sizeof(int));
-			idwt_alloc = 1;
-			break;
-	}
-
 	total_width = subband_width << 1;
 
 	/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
@@ -85,8 +68,8 @@ rfx_dwt_2d_decode(RFX_CONTEXT * context, int * buffer, int subband_width)
 			h_dst[x + 1] = (hh[n] << 1) + ((h_dst[x] + h_dst[x + 2]) >> 1);
 		}
 		x = n << 1;
-		l_dst[x + 1] = (hl[n] << 1) + ((l_dst[x] + l_dst[x]) >> 1);
-		h_dst[x + 1] = (hh[n] << 1) + ((h_dst[x] + h_dst[x]) >> 1);		
+		l_dst[x + 1] = (hl[n] << 1) + (l_dst[x]);
+		h_dst[x + 1] = (hh[n] << 1) + (h_dst[x]);		
 
 		ll += subband_width;
 		hl += subband_width;
@@ -120,8 +103,96 @@ rfx_dwt_2d_decode(RFX_CONTEXT * context, int * buffer, int subband_width)
 			dst[total_width] = (*h << 1) + ((dst[0] + dst[n < subband_width - 1 ? 2 * total_width : 0]) >> 1);
 		}
 	}
+}
 
-	if (idwt_alloc)
-		free(idwt);
+void
+rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer)
+{
+	rfx_dwt_2d_decode_block(buffer + 3840, dwt_buffer, 8);
+	rfx_dwt_2d_decode_block(buffer + 3072, dwt_buffer, 16);
+	rfx_dwt_2d_decode_block(buffer, dwt_buffer, 32);
 }
 
+void
+rfx_dwt_2d_encode_block(sint16 * buffer, sint16 * dwt, int subband_width)
+{
+	sint16 * src, * l, * h;
+	sint16 * l_src, * h_src;
+	sint16 * hl, * lh, * hh, * ll;
+	int total_width;
+	int x, y;
+	int n;
+
+	total_width = subband_width << 1;
+
+	/* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
+	for (x = 0; x < total_width; x++)
+	{
+		for (n = 0; n < subband_width; n++)
+		{
+			y = n << 1;
+			l = dwt + n * total_width + x;
+			h = l + subband_width * total_width;
+			src = buffer + y * total_width + x;
+
+			/* H */
+			*h = (src[total_width] - ((src[0] + src[n < subband_width - 1 ? 2 * total_width : total_width]) >> 1)) >> 1;
+
+			/* L */
+			*l = src[0] + (n == 0 ? *h : (*(h - total_width) + *h) >> 1);
+		}
+	}
+
+	/* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */
+	/* The lower part L generates LL(3) and HL(0). */
+	/* The higher part H generates LH(1) and HH(2). */
+
+	ll = buffer + subband_width * subband_width * 3;
+	hl = buffer;
+	l_src = dwt;
+
+	lh = buffer + subband_width * subband_width;
+	hh = buffer + subband_width * subband_width * 2;
+	h_src = dwt + subband_width * subband_width * 2;
+
+	for (y = 0; y < subband_width; y++)
+	{
+		/* L */
+		for (n = 0; n < subband_width; n++)
+		{
+			x = n << 1;
+
+			/* HL */
+			hl[n] = (l_src[x + 1] - ((l_src[x] + l_src[n < subband_width - 1 ? x + 2 : x]) >> 1)) >> 1;
+			/* LL */
+			ll[n] = l_src[x] + (n == 0 ? hl[n] : (hl[n - 1] + hl[n]) >> 1);
+		}
+
+		/* H */
+		for (n = 0; n < subband_width; n++)
+		{
+			x = n << 1;
+
+			/* HH */
+			hh[n] = (h_src[x + 1] - ((h_src[x] + h_src[n < subband_width - 1 ? x + 2 : x]) >> 1)) >> 1;
+			/* LH */
+			lh[n] = h_src[x] + (n == 0 ? hh[n] : (hh[n - 1] + hh[n]) >> 1);
+		}
+
+		ll += subband_width;
+		hl += subband_width;
+		l_src += total_width;
+
+		lh += subband_width;
+		hh += subband_width;
+		h_src += total_width;
+	}
+}
+
+void
+rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer)
+{
+	rfx_dwt_2d_encode_block(buffer, dwt_buffer, 32);
+	rfx_dwt_2d_encode_block(buffer + 3072, dwt_buffer, 16);
+	rfx_dwt_2d_encode_block(buffer + 3840, dwt_buffer, 8);
+}
diff --git a/libfreerdp-rfx/rfx_dwt.h b/libfreerdp-rfx/rfx_dwt.h
index 21140bb..449d61c 100644
--- a/libfreerdp-rfx/rfx_dwt.h
+++ b/libfreerdp-rfx/rfx_dwt.h
@@ -23,7 +23,9 @@
 #include <freerdp/rfx.h>
 
 void
-rfx_dwt_2d_decode(RFX_CONTEXT * context, int * buffer, int subband_width);
+rfx_dwt_2d_decode(sint16 * buffer, sint16 * dwt_buffer);
+void
+rfx_dwt_2d_encode(sint16 * buffer, sint16 * dwt_buffer);
 
 #endif
 
diff --git a/libfreerdp-rfx/rfx_encode.c b/libfreerdp-rfx/rfx_encode.c
new file mode 100644
index 0000000..cd20200
--- /dev/null
+++ b/libfreerdp-rfx/rfx_encode.c
@@ -0,0 +1,182 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   RemoteFX Codec Library - Encode
+
+   Copyright 2011 Vic Lee
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "rfx_rlgr.h"
+#include "rfx_differential.h"
+#include "rfx_quantization.h"
+#include "rfx_dwt.h"
+
+#include "rfx_encode.h"
+
+#define MINMAX(_v,_l,_h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v)))
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_encode_format_RGB(const uint8 * rgb_data, int width, int height, int rowstride,
+	RFX_PIXEL_FORMAT pixel_format, sint16 * r_buf, sint16 * g_buf, sint16 * b_buf)
+{
+	int x, y;
+	int x_exceed;
+	int y_exceed;
+	const uint8 * src;
+
+	x_exceed = 64 - width;
+	y_exceed = 64 - height;
+	for (y = 0; y < height; y++)
+	{
+		src = rgb_data + y * rowstride;
+
+		switch (pixel_format)
+		{
+			case RFX_PIXEL_FORMAT_BGRA:
+				for (x = 0; x < width; x++)
+				{
+					*b_buf++ = (sint16) (*src++);
+					*g_buf++ = (sint16) (*src++);
+					*r_buf++ = (sint16) (*src++);
+					src++;
+				}
+				break;
+			case RFX_PIXEL_FORMAT_RGBA:
+				for (x = 0; x < width; x++)
+				{
+					*r_buf++ = (sint16) (*src++);
+					*g_buf++ = (sint16) (*src++);
+					*b_buf++ = (sint16) (*src++);
+					src++;
+				}
+				break;
+			case RFX_PIXEL_FORMAT_BGR:
+				for (x = 0; x < width; x++)
+				{
+					*b_buf++ = (sint16) (*src++);
+					*g_buf++ = (sint16) (*src++);
+					*r_buf++ = (sint16) (*src++);
+				}
+				break;
+			case RFX_PIXEL_FORMAT_RGB:
+				for (x = 0; x < width; x++)
+				{
+					*r_buf++ = (sint16) (*src++);
+					*g_buf++ = (sint16) (*src++);
+					*b_buf++ = (sint16) (*src++);
+				}
+				break;
+			default:
+				break;
+		}
+		/* Fill the horizontal region outside of 64x64 tile size to 0 in order to be better compressed. */
+		if (x_exceed > 0)
+		{
+			memset(r_buf, 0, x_exceed * sizeof(sint16));
+			memset(g_buf, 0, x_exceed * sizeof(sint16));
+			memset(b_buf, 0, x_exceed * sizeof(sint16));
+			r_buf += x_exceed;
+			g_buf += x_exceed;
+			b_buf += x_exceed;
+		}
+	}
+
+	/* Fill the vertical region outside of 64x64 tile size to 0 in order to be better compressed. */
+	if (y_exceed > 0)
+	{
+		memset(r_buf, 0, y_exceed * 64 * sizeof(sint16));
+		memset(g_buf, 0, y_exceed * 64 * sizeof(sint16));
+		memset(b_buf, 0, y_exceed * 64 * sizeof(sint16));
+	}
+}
+
+void
+rfx_encode_RGB_to_YCbCr(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf)
+{
+	sint16 y, cb, cr;
+	sint16 r, g, b;
+
+	int i;
+	for (i = 0; i < 4096; i++)
+	{
+		r = y_r_buf[i];
+		g = cb_g_buf[i];
+		b = cr_b_buf[i];
+		y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) + ((b >> 4) + (b >> 5) + (b >> 6));
+		y_r_buf[i] = MINMAX(y, 0, 255) - 128;
+		cb = 0 - ((r >> 3) + (r >> 5) + (r >> 7)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1);
+		cb_g_buf[i] = MINMAX(cb, -128, 127);
+		cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 7)) - ((b >> 4) + (b >> 6));
+		cr_b_buf[i] = MINMAX(cr, -128, 127);
+	}
+}
+
+static void
+rfx_encode_component(RFX_CONTEXT * context, const uint32 * quantization_values,
+	sint16 * data, uint8 * buffer, int buffer_size, int * size)
+{
+	PROFILER_ENTER(context->prof_rfx_encode_component);
+
+	PROFILER_ENTER(context->prof_rfx_dwt_2d_encode);
+		context->dwt_2d_encode(data, context->dwt_buffer);
+	PROFILER_EXIT(context->prof_rfx_dwt_2d_encode);
+
+	PROFILER_ENTER(context->prof_rfx_quantization_encode);
+		context->quantization_encode(data, quantization_values);
+	PROFILER_EXIT(context->prof_rfx_quantization_encode);
+
+	PROFILER_ENTER(context->prof_rfx_differential_encode);
+		rfx_differential_encode(data + 4032, 64);
+	PROFILER_EXIT(context->prof_rfx_differential_encode);
+
+	PROFILER_ENTER(context->prof_rfx_rlgr_encode);
+		*size = rfx_rlgr_encode(context->mode, data, 4096, buffer, buffer_size);
+	PROFILER_EXIT(context->prof_rfx_rlgr_encode);
+
+	PROFILER_EXIT(context->prof_rfx_encode_component);
+}
+
+void
+rfx_encode_rgb(RFX_CONTEXT * context, const uint8 * rgb_data, int width, int height, int rowstride,
+	const uint32 * y_quants, const uint32 * cb_quants, const uint32 * cr_quants,
+	uint8 * ycbcr_buffer, int buffer_size, int * y_size, int * cb_size, int * cr_size)
+{
+	sint16 * y_r_buffer = context->y_r_buffer;
+	sint16 * cb_g_buffer = context->cb_g_buffer;
+	sint16 * cr_b_buffer = context->cr_b_buffer;
+
+	PROFILER_ENTER(context->prof_rfx_encode_rgb);
+
+	PROFILER_ENTER(context->prof_rfx_encode_format_RGB);
+		rfx_encode_format_RGB(rgb_data, width, height, rowstride,
+			context->pixel_format, y_r_buffer, cb_g_buffer, cr_b_buffer);
+	PROFILER_EXIT(context->prof_rfx_encode_format_RGB);
+
+	PROFILER_ENTER(context->prof_rfx_encode_RGB_to_YCbCr);
+		context->encode_RGB_to_YCbCr(context->y_r_buffer, context->cb_g_buffer, context->cr_b_buffer);
+	PROFILER_EXIT(context->prof_rfx_encode_RGB_to_YCbCr);
+
+	rfx_encode_component(context, y_quants, context->y_r_buffer, ycbcr_buffer, buffer_size, y_size);
+	ycbcr_buffer += (*y_size);
+	buffer_size -= (*y_size);
+	rfx_encode_component(context, cb_quants, context->cb_g_buffer, ycbcr_buffer, buffer_size, cb_size);
+	ycbcr_buffer += (*cb_size);
+	buffer_size -= (*cb_size);
+	rfx_encode_component(context, cr_quants, context->cr_b_buffer, ycbcr_buffer, buffer_size, cr_size);
+
+	PROFILER_EXIT(context->prof_rfx_encode_rgb);
+}
diff --git a/libfreerdp-rfx/rfx_encode.h b/libfreerdp-rfx/rfx_encode.h
new file mode 100644
index 0000000..2fac0be
--- /dev/null
+++ b/libfreerdp-rfx/rfx_encode.h
@@ -0,0 +1,34 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   RemoteFX Codec Library - Decode
+
+   Copyright 2011 Vic Lee
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef __RFX_ENCODE_H
+#define __RFX_ENCODE_H
+
+#include <freerdp/rfx.h>
+
+void
+rfx_encode_RGB_to_YCbCr(sint16 * y_r_buf, sint16 * cb_g_buf, sint16 * cr_b_buf);
+
+void
+rfx_encode_rgb(RFX_CONTEXT * context, const uint8 * rgb_data, int width, int height, int rowstride,
+	const uint32 * y_quants, const uint32 * cb_quants, const uint32 * cr_quants,
+	uint8 * ycbcr_buffer, int buffer_size, int * y_size, int * cb_size, int * cr_size);
+
+#endif
+
diff --git a/libfreerdp-rfx/rfx_quantization.c b/libfreerdp-rfx/rfx_quantization.c
index 9d67c0c..67755c0 100644
--- a/libfreerdp-rfx/rfx_quantization.c
+++ b/libfreerdp-rfx/rfx_quantization.c
@@ -19,17 +19,63 @@
 
 #include "rfx_quantization.h"
 
+static void
+rfx_quantization_decode_block(sint16 * buffer, int buffer_size, uint32 factor)
+{
+	sint16 * dst;
+
+	if (factor <= 6)
+		return;
+
+	factor -= 6;
+	for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
+	{
+		*dst <<= factor;
+	}
+}
+
 void
-rfx_quantization_decode(uint32 * buffer, int buffer_size, uint32 factor)
+rfx_quantization_decode(sint16 * buffer, const uint32 * quantization_values)
+{
+	rfx_quantization_decode_block(buffer, 1024, quantization_values[8]); /* HL1 */
+	rfx_quantization_decode_block(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
+	rfx_quantization_decode_block(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
+	rfx_quantization_decode_block(buffer + 3072, 256, quantization_values[5]); /* HL2 */
+	rfx_quantization_decode_block(buffer + 3328, 256, quantization_values[4]); /* LH2 */
+	rfx_quantization_decode_block(buffer + 3584, 256, quantization_values[6]); /* HH2 */
+	rfx_quantization_decode_block(buffer + 3840, 64, quantization_values[2]); /* HL3 */
+	rfx_quantization_decode_block(buffer + 3904, 64, quantization_values[1]); /* LH3 */
+	rfx_quantization_decode_block(buffer + 3868, 64, quantization_values[3]); /* HH3 */
+	rfx_quantization_decode_block(buffer + 4032, 64, quantization_values[0]); /* LL3 */
+}
+
+static void
+rfx_quantization_encode_block(sint16 * buffer, int buffer_size, uint32 factor)
 {
-	uint32 * dst;
+	sint16 * dst;
 
 	if (factor <= 6)
 		return;
 
+	factor -= 6;
 	for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
 	{
-		*dst <<= (factor - 6);
+		*dst >>= factor;
 	}
 }
 
+void
+rfx_quantization_encode(sint16 * buffer, const uint32 * quantization_values)
+{
+	rfx_quantization_encode_block(buffer, 1024, quantization_values[8]); /* HL1 */
+	rfx_quantization_encode_block(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
+	rfx_quantization_encode_block(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
+	rfx_quantization_encode_block(buffer + 3072, 256, quantization_values[5]); /* HL2 */
+	rfx_quantization_encode_block(buffer + 3328, 256, quantization_values[4]); /* LH2 */
+	rfx_quantization_encode_block(buffer + 3584, 256, quantization_values[6]); /* HH2 */
+	rfx_quantization_encode_block(buffer + 3840, 64, quantization_values[2]); /* HL3 */
+	rfx_quantization_encode_block(buffer + 3904, 64, quantization_values[1]); /* LH3 */
+	rfx_quantization_encode_block(buffer + 3868, 64, quantization_values[3]); /* HH3 */
+	rfx_quantization_encode_block(buffer + 4032, 64, quantization_values[0]); /* LL3 */
+}
+
diff --git a/libfreerdp-rfx/rfx_quantization.h b/libfreerdp-rfx/rfx_quantization.h
index 53d35b8..6a37f28 100644
--- a/libfreerdp-rfx/rfx_quantization.h
+++ b/libfreerdp-rfx/rfx_quantization.h
@@ -23,7 +23,9 @@
 #include <freerdp/rfx.h>
 
 void
-rfx_quantization_decode(uint32 * buffer, int buffer_size, uint32 factor);
+rfx_quantization_decode(sint16 * buffer, const uint32 * quantization_values);
+void
+rfx_quantization_encode(sint16 * buffer, const uint32 * quantization_values);
 
 #endif
 
diff --git a/libfreerdp-rfx/rfx_rlgr.c b/libfreerdp-rfx/rfx_rlgr.c
index c727e7c..2d27907 100644
--- a/libfreerdp-rfx/rfx_rlgr.c
+++ b/libfreerdp-rfx/rfx_rlgr.c
@@ -56,7 +56,7 @@
 		nZeroesWritten = buffer_size; \
 	if (nZeroesWritten > 0) \
 	{ \
-		memset(dst, 0, nZeroesWritten * sizeof(int)); \
+		memset(dst, 0, nZeroesWritten * sizeof(sint16)); \
 		dst += nZeroesWritten; \
 	} \
 	buffer_size -= (nZeroes); \
@@ -72,10 +72,10 @@
 		_v >>= 1; \
 		_nbits++; \
 	} \
-} \
+}
 
 /* Converts from (2 * magnitude - sign) to integer */
-#define GetIntFrom2MagSign(twoMs) (((twoMs) & 1) ? -1 * (int)(((twoMs) + 1) >> 1) : (int)((twoMs) >> 1))
+#define GetIntFrom2MagSign(twoMs) (((twoMs) & 1) ? -1 * (sint16)(((twoMs) + 1) >> 1) : (sint16)((twoMs) >> 1))
 
 /*
  * Update the passed parameter and clamp it to the range [0, KPMAX]
@@ -94,11 +94,11 @@
 /* Outputs the Golomb/Rice encoding of a non-negative integer */
 #define GetGRCode(krp, kr) rfx_rlgr_get_gr_code(bs, krp, kr)
 
-static uint32
+static uint16
 rfx_rlgr_get_gr_code(RFX_BITSTREAM * bs, int * krp, int * kr)
 {
 	int vk;
-	uint32 mag;
+	uint16 mag;
 
 	/* chew up/count leading 1s and escape 0 */
 	for (vk = 0; GetBits(1) == 1;)
@@ -122,13 +122,13 @@ rfx_rlgr_get_gr_code(RFX_BITSTREAM * bs, int * krp, int * kr)
 }
 
 int
-rfx_rlgr_decode(RLGR_MODE mode, const uint8 * data, int data_size, uint32 * buffer, int buffer_size)
+rfx_rlgr_decode(RLGR_MODE mode, const uint8 * data, int data_size, sint16 * buffer, int buffer_size)
 {
 	int k;
 	int kp;
 	int kr;
 	int krp;
-	uint32 * dst;
+	sint16 * dst;
 	RFX_BITSTREAM * bs;
 
 	bs = rfx_bitstream_new();
@@ -230,3 +230,202 @@ rfx_rlgr_decode(RLGR_MODE mode, const uint8 * data, int data_size, uint32 * buff
 
 	return (dst - buffer);
 }
+
+/* Returns the next coefficient (a signed int) to encode, from the input stream */
+#define GetNextInput(_n) \
+{ \
+	if (data_size > 0) \
+	{ \
+		_n = *data++; \
+		data_size--; \
+	} \
+	else \
+	{ \
+		_n = 0; \
+	} \
+}
+
+/* Emit bitPattern to the output bitstream */
+#define OutputBits(numBits, bitPattern) rfx_bitstream_put_bits(bs, bitPattern, numBits);
+
+/* Emit a bit (0 or 1), count number of times, to the output bitstream */
+#define OutputBit(count, bit) \
+{	\
+	uint16 _b = (bit ? 0xFFFF : 0); \
+	int _c = (count); \
+	for (; _c > 0; _c -= 16) \
+		rfx_bitstream_put_bits(bs, _b, (_c > 16 ? 16 : _c)); \
+}
+
+/* Converts the input value to (2 * abs(input) - sign(input)), where sign(input) = (input < 0 ? 1 : 0) and returns it */
+#define Get2MagSign(input) ((input) >= 0 ? 2 * (input) : -2 * (input) - 1)
+
+/* Outputs the Golomb/Rice encoding of a non-negative integer */
+#define CodeGR(krp, val) rfx_rlgr_code_gr(bs, krp, val)
+
+static void
+rfx_rlgr_code_gr(RFX_BITSTREAM * bs, int * krp, uint16 val)
+{
+	int kr = *krp >> LSGR;
+
+	/* unary part of GR code */
+
+	uint16 vk = (val) >> kr;
+	OutputBit(vk, 1);
+	OutputBit(1, 0);
+
+	/* remainder part of GR code, if needed */
+	if (kr)
+	{
+		OutputBits(kr, val & ((1 << kr) - 1));
+	}
+
+	/* update krp, only if it is not equal to 1 */
+	if (vk == 0)
+	{
+		UpdateParam(*krp, -2, kr);
+	}
+ 	else if (vk > 1)
+	{
+		UpdateParam(*krp, vk, kr);
+	}
+}
+
+int
+rfx_rlgr_encode(RLGR_MODE mode, const sint16 * data, int data_size, uint8 * buffer, int buffer_size)
+{
+	int k;
+	int kp;
+	int kr;
+	int krp;
+	RFX_BITSTREAM * bs;
+	int processed_size;
+
+	bs = rfx_bitstream_new();
+	rfx_bitstream_put_buffer(bs, buffer, buffer_size);
+
+	/* initialize the parameters */
+	k = 1;
+	kp = 1 << LSGR;
+	kr = 1;
+	krp = 1 << LSGR;
+
+	/* process all the input coefficients */
+	while (data_size > 0)
+	{
+		int input;
+
+		if (k)
+		{
+			int numZeros;
+			int runmax;
+			int mag;
+			int sign;
+
+			/* RUN-LENGTH MODE */
+
+			/* collect the run of zeros in the input stream */
+			numZeros = 0;
+			GetNextInput(input);
+			while (input == 0 && data_size > 0)
+			{
+				numZeros++;
+				GetNextInput(input);
+			}
+
+			// emit output zeros
+			runmax = 1 << k;
+			while (numZeros >= runmax)
+			{
+				OutputBit(1, 0); /* output a zero bit */
+				numZeros -= runmax;
+				UpdateParam(kp, UP_GR, k); /* update kp, k */
+				runmax = 1 << k;
+			}
+
+			/* output a 1 to terminate runs */
+			OutputBit(1, 1);
+
+			/* output the remaining run length using k bits */
+			OutputBits(k, numZeros);
+
+			if (input != 0)
+			{
+				/* encode the nonzero value using GR coding */
+				mag = (input < 0 ? -input : input); /* absolute value of input coefficient */
+				sign = (input < 0 ? 1 : 0);  /* sign of input coefficient */
+
+				OutputBit(1, sign); /* output the sign bit */
+				CodeGR(&krp, mag - 1); /* output GR code for (mag - 1) */
+
+				UpdateParam(kp, -DN_GR, k);
+			}
+		}
+		else
+		{
+			/* GOLOMB-RICE MODE */
+
+			if (mode == RLGR1)
+			{
+				uint32 twoMs;
+
+				/* RLGR1 variant */
+
+				/* convert input to (2*magnitude - sign), encode using GR code */
+				GetNextInput(input);
+				twoMs = Get2MagSign(input);
+				CodeGR(&krp, twoMs);
+
+				/* update k, kp */
+				if (twoMs)
+				{
+					UpdateParam(kp, UQ_GR, k);
+				}
+				else
+				{
+					UpdateParam(kp, -DQ_GR, k);
+				}
+			}
+			else /* mode == RLGR3 */
+			{
+				uint32 twoMs1;
+				uint32 twoMs2;
+				uint32 sum2Ms;
+				uint32 nIdx;
+
+				/* RLGR3 variant */
+
+				/* convert the next two input values to (2*magnitude - sign) and */
+				/* encode their sum using GR code */
+
+				GetNextInput(input);
+				twoMs1 = Get2MagSign(input);
+				GetNextInput(input);
+				twoMs2 = Get2MagSign(input);
+				sum2Ms = twoMs1 + twoMs2;
+
+				CodeGR(&krp, sum2Ms);
+
+				/* encode binary representation of the first input (twoMs1). */
+				GetMinBits(sum2Ms, nIdx);
+				OutputBits(nIdx, twoMs1);
+
+				/* update k,kp for the two input values */
+
+				if (twoMs1 && twoMs2)
+				{
+					UpdateParam(kp, -2 * DQ_GR, k);
+				}
+				else if (!twoMs1 && !twoMs2)
+				{
+					UpdateParam(kp, 2 * UQ_GR, k);
+				}
+			}
+		}
+	}
+
+	processed_size = rfx_bitstream_get_processed_bytes(bs);
+	rfx_bitstream_free(bs);
+
+	return processed_size;
+}
diff --git a/libfreerdp-rfx/rfx_rlgr.h b/libfreerdp-rfx/rfx_rlgr.h
index 3f988c4..75aa7e4 100644
--- a/libfreerdp-rfx/rfx_rlgr.h
+++ b/libfreerdp-rfx/rfx_rlgr.h
@@ -23,7 +23,9 @@
 #include <freerdp/rfx.h>
 
 int
-rfx_rlgr_decode(RLGR_MODE mode, const uint8 * data, int data_size, uint32 * buffer, int buffer_size);
+rfx_rlgr_decode(RLGR_MODE mode, const uint8 * data, int data_size, sint16 * buffer, int buffer_size);
+int
+rfx_rlgr_encode(RLGR_MODE mode, const sint16 * data, int data_size, uint8 * buffer, int buffer_size);
 
 #endif
 
diff --git a/libfreerdp-rfx/sse/Makefile.am b/libfreerdp-rfx/sse/Makefile.am
index a97edae..faa9796 100644
--- a/libfreerdp-rfx/sse/Makefile.am
+++ b/libfreerdp-rfx/sse/Makefile.am
@@ -3,9 +3,13 @@
 # libfreerdp-rfx-sse
 noinst_LTLIBRARIES = libfreerdp-rfx-sse.la
 
-libfreerdp_rfx_sse_la_SOURCES = \
+libfreerdp_rfx_sse_la_SOURCES =
+
+if WITH_SSE
+libfreerdp_rfx_sse_la_SOURCES += \
 	rfx_sse.c rfx_sse.h \
 	rfx_sse2.c rfx_sse2.h
+endif
 
 libfreerdp_rfx_sse_la_CFLAGS = \
 	-I$(top_srcdir) \
diff --git a/libfreerdp-rfx/sse/rfx_sse.c b/libfreerdp-rfx/sse/rfx_sse.c
index 7353e37..76a632d 100644
--- a/libfreerdp-rfx/sse/rfx_sse.c
+++ b/libfreerdp-rfx/sse/rfx_sse.c
@@ -22,11 +22,23 @@
 #include <string.h>
 
 #include "rfx_sse2.h"
-
 #include "rfx_sse.h"
 
 void rfx_init_sse(RFX_CONTEXT * context)
 {
-	DEBUG_RFX("Using SSE2 optimizations");
-	context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2;
+		DEBUG_RFX("Using SSE2 optimizations");
+
+		IF_PROFILER(context->prof_rfx_decode_YCbCr_to_RGB->name = "rfx_decode_YCbCr_to_RGB_SSE2");
+		IF_PROFILER(context->prof_rfx_encode_RGB_to_YCbCr->name = "rfx_encode_RGB_to_YCbCr_SSE2");
+		IF_PROFILER(context->prof_rfx_quantization_decode->name = "rfx_quantization_decode_SSE2");
+		IF_PROFILER(context->prof_rfx_quantization_encode->name = "rfx_quantization_encode_SSE2");
+		IF_PROFILER(context->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_SSE2");
+		IF_PROFILER(context->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_SSE2");
+		
+		context->decode_YCbCr_to_RGB = rfx_decode_YCbCr_to_RGB_SSE2;
+		context->encode_RGB_to_YCbCr = rfx_encode_RGB_to_YCbCr_SSE2;
+		context->quantization_decode = rfx_quantization_decode_SSE2;
+		context->quantization_encode = rfx_quantization_encode_SSE2;
+		context->dwt_2d_decode = rfx_dwt_2d_decode_SSE2;
+		context->dwt_2d_encode = rfx_dwt_2d_encode_SSE2;
 }
diff --git a/libfreerdp-rfx/sse/rfx_sse.h b/libfreerdp-rfx/sse/rfx_sse.h
index 48b8f29..e5be61c 100644
--- a/libfreerdp-rfx/sse/rfx_sse.h
+++ b/libfreerdp-rfx/sse/rfx_sse.h
@@ -31,20 +31,12 @@ void rfx_init_sse(RFX_CONTEXT * context);
 #define RFX_INIT_SIMD(_rfx_context) rfx_init_sse(_rfx_context)
 #endif
 
-static __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_between_ps (__m128 val, __m128 min, __m128 max)
+static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_between_epi16 (__m128i val, __m128i min, __m128i max)
 {
-	__m128 ret;
-	ret = _mm_max_ps(val, min);
-	return _mm_min_ps(ret, max);
-}
-
-static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtps_epi32_and_store (__m128i * loc, __m128 val)
-{
-	__m128i tmp;
-	tmp = _mm_cvtps_epi32(val);
-	_mm_stream_si128(loc, tmp);
+	__m128i ret;
+	ret = _mm_max_epi16(val, min);
+	return _mm_min_epi16(ret, max);
 }
 
 #endif /* __RFX_SSE_H */
diff --git a/libfreerdp-rfx/sse/rfx_sse2.c b/libfreerdp-rfx/sse/rfx_sse2.c
index c796c92..3434ec5 100644
--- a/libfreerdp-rfx/sse/rfx_sse2.c
+++ b/libfreerdp-rfx/sse/rfx_sse2.c
@@ -25,57 +25,583 @@
 
 #include "rfx_sse2.h"
 
-void rfx_decode_YCbCr_to_RGB_SSE2(uint32 * y_r_buffer, uint32 * cb_g_buffer, uint32 * cr_b_buffer)
+#define CACHE_LINE_BYTES	64
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_prefetch_buffer(char * buffer, int num_bytes)
 {
-	__m128 y_add = _mm_set_ps1(128.0f);
-	__m128 r_cr_t = _mm_set_ps1(1.403f);
-	__m128 g_cb_t = _mm_set_ps1(-0.344f);
-	__m128 g_cr_t = _mm_set_ps1(-0.714f);
-	__m128 b_cb_t = _mm_set_ps1(1.77f);
+	__m128i * buf = (__m128i*) buffer;
+	int i;
+	for (i = 0; i < (num_bytes / sizeof(__m128i)); i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
+	{
+		_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
+	}
+}
+
+void
+rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
+{	
+	__m128i zero = _mm_setzero_si128();
+	__m128i max = _mm_set1_epi16(255);
+
+	__m128i * y_r_buf = (__m128i*) y_r_buffer;
+	__m128i * cb_g_buf = (__m128i*) cb_g_buffer;
+	__m128i * cr_b_buf = (__m128i*) cr_b_buffer;
+
+	__m128i y;
+	__m128i cr;
+	__m128i cb;
+	__m128i r;
+	__m128i g;
+	__m128i b;
+
+	int i;
 
-	__m128 min = _mm_set_ps1(0.0f);
-	__m128 max = _mm_set_ps1(255.0f);
+	for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+	{
+		_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
+		_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
+		_mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
+	}
+	for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
+	{
+		/* y = y_r_buf[i] + 128; */
+		y = _mm_load_si128(&y_r_buf[i]);
+		y = _mm_add_epi16(y, _mm_set1_epi16(128));
 
-	__m128 y, cb, cr;
-	__m128 r, g, b, tmp;	
+		/* cr = cr_b_buf[i]; */
+		cr = _mm_load_si128(&cr_b_buf[i]);
+
+		/* r = between(y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5), 0, 255); */
+		r = _mm_add_epi16(y, cr);
+		r = _mm_add_epi16(r, _mm_srai_epi16(cr, 2));
+		r = _mm_add_epi16(r, _mm_srai_epi16(cr, 3));
+		r = _mm_add_epi16(r, _mm_srai_epi16(cr, 5));
+		r = _mm_between_epi16(r, zero, max);
+		_mm_store_si128(&y_r_buf[i], r);
+
+		/* cb = cb_g_buf[i]; */
+		cb = _mm_load_si128(&cb_g_buf[i]);
+
+		/* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); */
+		g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 2));
+		g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 4));
+		g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 5));
+		g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 1));
+		g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 3));
+		g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 4));
+		g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 5));
+		g = _mm_between_epi16(g, zero, max);
+		_mm_store_si128(&cb_g_buf[i], g);		
+
+		/* b = between(y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6), 0, 255); */
+		b = _mm_add_epi16(y, cb);
+		b = _mm_add_epi16(b, _mm_srai_epi16(cb, 1));
+		b = _mm_add_epi16(b, _mm_srai_epi16(cb, 2));
+		b = _mm_add_epi16(b, _mm_srai_epi16(cb, 6));
+		b = _mm_between_epi16(b, zero, max);
+		_mm_store_si128(&cr_b_buf[i], b);
+	}
+}
+
+void
+rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
+{
+	__m128i min = _mm_set1_epi16(-128);
+	__m128i max = _mm_set1_epi16(127);
 
 	__m128i * y_r_buf = (__m128i*) y_r_buffer;
 	__m128i * cb_g_buf = (__m128i*) cb_g_buffer;
 	__m128i * cr_b_buf = (__m128i*) cr_b_buffer;
 
+	__m128i y;
+	__m128i cr;
+	__m128i cb;
+	__m128i r;
+	__m128i g;
+	__m128i b;
+
 	int i;
-	for (i = 0; i < (4096 / 4); i++)
+
+	for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
 	{
-		y = _mm_cvtepi32_ps(*y_r_buf);
-		cb = _mm_cvtepi32_ps(*cb_g_buf);
-		cr = _mm_cvtepi32_ps(*cr_b_buf);
-
-		/* y = y + 128 */
-		y = _mm_add_ps(y, y_add);
-
-		/* r = between(y + (cr * 1.403), 0, 255) */
-		r = _mm_mul_ps(cr, r_cr_t);
-		r = _mm_add_ps(r, y);
-		r = _mm_between_ps(r, min, max);
-		_mm_cvtps_epi32_and_store(y_r_buf, r);
-
-		/* g = between(y + (cb * -0.344) + (cr * -0.714), 0, 255) */
-		g = _mm_mul_ps(cb, g_cb_t);
-		tmp = _mm_mul_ps(cr, g_cr_t);
-		g = _mm_add_ps(g, tmp);
-		g = _mm_add_ps(g, y);
-		g = _mm_between_ps(g, min, max);
-		_mm_cvtps_epi32_and_store(cb_g_buf, g);
-
-		/* b = between(y + (cb * 1.77), 0, 255) */
-		b = _mm_mul_ps(cb, b_cb_t);
-		b = _mm_add_ps(b, y);
-		b = _mm_between_ps(b, min, max);
-		_mm_cvtps_epi32_and_store(cr_b_buf, b);
-
-		y_r_buf++;
-		cb_g_buf++;
-		cr_b_buf++;
+		_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
+		_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
+		_mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
 	}
+	for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
+	{
+		/* r = y_r_buf[i]; */
+		r = _mm_load_si128(&y_r_buf[i]);
+
+		/* g = cb_g_buf[i]; */
+		g = _mm_load_si128(&cb_g_buf[i]);
+
+		/* b = cr_b_buf[i]; */
+		b = _mm_load_si128(&cr_b_buf[i]);
+
+		/* y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) + ((b >> 4) + (b >> 5) + (b >> 6)); */
+		/* y_r_buf[i] = MINMAX(y, 0, 255) - 128; */
+		y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5));
+		y = _mm_add_epi16(y, _mm_srai_epi16(r, 6));
+		y = _mm_add_epi16(y, _mm_srai_epi16(g, 1));
+		y = _mm_add_epi16(y, _mm_srai_epi16(g, 4));
+		y = _mm_add_epi16(y, _mm_srai_epi16(g, 6));
+		y = _mm_add_epi16(y, _mm_srai_epi16(g, 7));
+		y = _mm_add_epi16(y, _mm_srai_epi16(b, 4));
+		y = _mm_add_epi16(y, _mm_srai_epi16(b, 5));
+		y = _mm_add_epi16(y, _mm_srai_epi16(b, 6));
+		y = _mm_add_epi16(y, min);
+		y = _mm_between_epi16(y, min, max);
+		_mm_store_si128(&y_r_buf[i], y);
+
+		/* cb = 0 - ((r >> 3) + (r >> 5) + (r >> 7)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1); */
+		/* cb_g_buf[i] = MINMAX(cb, -128, 127); */
+		cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3));
+		cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5));
+		cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 7));
+		cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 2));
+		cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4));
+		cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6));
+		cb = _mm_between_epi16(cb, min, max);
+		_mm_store_si128(&cb_g_buf[i], cb);
+
+		/* cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 7)) - ((b >> 4) + (b >> 6)); */
+		/* cr_b_buf[i] = MINMAX(cr, -128, 127); */
+		cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2));
+		cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3));
+		cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 5));
+		cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 7));
+		cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4));
+		cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 6));
+		cr = _mm_between_epi16(cr, min, max);
+		_mm_store_si128(&cr_b_buf[i], cr);
+	}
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_quantization_decode_block_SSE2(sint16 * buffer, const int buffer_size, const uint32 factor)
+{
+	int shift = factor-6;
+	if (shift <= 0)
+		return;
+	
+	__m128i a;
+	__m128i * ptr = (__m128i*) buffer;
+	__m128i * buf_end = (__m128i*) (buffer + buffer_size);
+	do
+	{
+		a = _mm_load_si128(ptr);
+		a = _mm_slli_epi16(a, shift);
+		_mm_store_si128(ptr, a);
+
+		ptr++;
+	} while(ptr < buf_end);
+}
+
+void
+rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values)
+{
+	_mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));
+
+	rfx_quantization_decode_block_SSE2(buffer, 1024, quantization_values[8]); /* HL1 */
+	rfx_quantization_decode_block_SSE2(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
+	rfx_quantization_decode_block_SSE2(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
+	rfx_quantization_decode_block_SSE2(buffer + 3072, 256, quantization_values[5]); /* HL2 */
+	rfx_quantization_decode_block_SSE2(buffer + 3328, 256, quantization_values[4]); /* LH2 */
+	rfx_quantization_decode_block_SSE2(buffer + 3584, 256, quantization_values[6]); /* HH2 */
+	rfx_quantization_decode_block_SSE2(buffer + 3840, 64, quantization_values[2]); /* HL3 */
+	rfx_quantization_decode_block_SSE2(buffer + 3904, 64, quantization_values[1]); /* LH3 */
+	rfx_quantization_decode_block_SSE2(buffer + 3868, 64, quantization_values[3]); /* HH3 */
+	rfx_quantization_decode_block_SSE2(buffer + 4032, 64, quantization_values[0]); /* LL3 */
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_quantization_encode_block_SSE2(sint16 * buffer, const int buffer_size, const uint32 factor)
+{
+	int shift = factor-6;
+	if (shift <= 0)
+		return;
+	
+	__m128i a;
+	__m128i * ptr = (__m128i*) buffer;
+	__m128i * buf_end = (__m128i*) (buffer + buffer_size);
+	do
+	{
+		a = _mm_load_si128(ptr);
+		a = _mm_srai_epi16(a, shift);
+		_mm_store_si128(ptr, a);
+
+		ptr++;
+	} while(ptr < buf_end);
+}
+
+void
+rfx_quantization_encode_SSE2(sint16 * buffer, const uint32 * quantization_values)
+{
+	_mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));
+
+	rfx_quantization_encode_block_SSE2(buffer, 1024, quantization_values[8]); /* HL1 */
+	rfx_quantization_encode_block_SSE2(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
+	rfx_quantization_encode_block_SSE2(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
+	rfx_quantization_encode_block_SSE2(buffer + 3072, 256, quantization_values[5]); /* HL2 */
+	rfx_quantization_encode_block_SSE2(buffer + 3328, 256, quantization_values[4]); /* LH2 */
+	rfx_quantization_encode_block_SSE2(buffer + 3584, 256, quantization_values[6]); /* HH2 */
+	rfx_quantization_encode_block_SSE2(buffer + 3840, 64, quantization_values[2]); /* HL3 */
+	rfx_quantization_encode_block_SSE2(buffer + 3904, 64, quantization_values[1]); /* LH3 */
+	rfx_quantization_encode_block_SSE2(buffer + 3868, 64, quantization_values[3]); /* HH3 */
+	rfx_quantization_encode_block_SSE2(buffer + 4032, 64, quantization_values[0]); /* LL3 */
 }
 
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_decode_block_horiz_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
+{
+	int y, n;
+	sint16 * l_ptr = l;
+	sint16 * h_ptr = h;
+	sint16 * dst_ptr = dst;
+	int first;
+	int last;
+	__m128i l_n;
+	__m128i h_n;
+	__m128i h_n_m;
+	__m128i tmp_n;
+	__m128i dst_n;
+	__m128i dst_n_p;
+	__m128i dst1;
+	__m128i dst2;
+
+	for (y = 0; y < subband_width; y++)
+	{
+		/* Even coefficients */
+		for (n = 0; n < subband_width; n+=8)
+		{
+			/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
+			
+			l_n = _mm_load_si128((__m128i*) l_ptr);
+
+			h_n = _mm_load_si128((__m128i*) h_ptr);
+			h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
+			if (n == 0)
+			{
+				first = _mm_extract_epi16(h_n_m, 1);
+				h_n_m = _mm_insert_epi16(h_n_m, first, 0);
+			}
+			
+			tmp_n = _mm_add_epi16(h_n, h_n_m);
+			tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
+			tmp_n = _mm_srai_epi16(tmp_n, 1);
+			
+			dst_n = _mm_sub_epi16(l_n, tmp_n);
+			
+			_mm_store_si128((__m128i*) l_ptr, dst_n);
+			
+			l_ptr+=8;
+			h_ptr+=8;
+		}
+		l_ptr -= subband_width;
+		h_ptr -= subband_width;
+		
+		/* Odd coefficients */
+		for (n = 0; n < subband_width; n+=8)
+		{
+			/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
+			
+			h_n = _mm_load_si128((__m128i*) h_ptr);
+			
+			h_n = _mm_slli_epi16(h_n, 1);
+			
+			dst_n = _mm_load_si128((__m128i*) (l_ptr));
+			dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
+			if (n == subband_width - 8)
+			{
+				last = _mm_extract_epi16(dst_n_p, 6);
+				dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
+			}
+			
+			tmp_n = _mm_add_epi16(dst_n_p, dst_n);
+			tmp_n = _mm_srai_epi16(tmp_n, 1);
+			
+			tmp_n = _mm_add_epi16(tmp_n, h_n);
+			
+			dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
+			dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
+			
+			_mm_store_si128((__m128i*) dst_ptr, dst1);
+			_mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
+			
+			l_ptr+=8;
+			h_ptr+=8;
+			dst_ptr+=16;
+		}
+	}
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_decode_block_vert_SSE2(sint16 * l, sint16 * h, sint16 * dst, int subband_width)
+{
+	int x, n;
+	sint16 * l_ptr = l;
+	sint16 * h_ptr = h;
+	sint16 * dst_ptr = dst;
+	__m128i l_n;
+	__m128i h_n;
+	__m128i tmp_n;
+	__m128i h_n_m;
+	__m128i dst_n;
+	__m128i dst_n_m;
+	__m128i dst_n_p;
+	
+	int total_width = subband_width + subband_width;
+
+	/* Even coefficients */
+	for (n = 0; n < subband_width; n++)
+	{
+		for (x = 0; x < total_width; x+=8)
+		{
+			/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
+			
+			l_n = _mm_load_si128((__m128i*) l_ptr);
+			h_n = _mm_load_si128((__m128i*) h_ptr);
+			
+			tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));;
+			if (n == 0)
+				tmp_n = _mm_add_epi16(tmp_n, h_n);
+			else
+			{
+				h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
+				tmp_n = _mm_add_epi16(tmp_n, h_n_m);
+			}
+			tmp_n = _mm_srai_epi16(tmp_n, 1);
+			
+			dst_n = _mm_sub_epi16(l_n, tmp_n);
+			_mm_store_si128((__m128i*) dst_ptr, dst_n);
+			
+			l_ptr+=8;
+			h_ptr+=8;
+			dst_ptr+=8;
+		}
+		dst_ptr+=total_width;
+	}
+	
+	h_ptr = h;
+	dst_ptr = dst + total_width;
+	
+	/* Odd coefficients */
+	for (n = 0; n < subband_width; n++)
+	{
+		for (x = 0; x < total_width; x+=8)
+		{
+			/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
+			
+			h_n = _mm_load_si128((__m128i*) h_ptr);
+			dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
+			h_n = _mm_slli_epi16(h_n, 1);
+			
+			tmp_n = dst_n_m;
+			if (n == subband_width - 1)
+				tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
+			else
+			{
+				dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
+				tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
+			}
+			tmp_n = _mm_srai_epi16(tmp_n, 1);
+			
+			dst_n = _mm_add_epi16(tmp_n, h_n);
+			_mm_store_si128((__m128i*) dst_ptr, dst_n);
+
+			h_ptr+=8;
+			dst_ptr+=8;
+		}
+		dst_ptr+=total_width;
+	}
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_decode_block_SSE2(sint16 * buffer, sint16 * idwt, int subband_width)
+{
+	sint16 * hl, * lh, * hh, * ll;
+	sint16 * l_dst, * h_dst;
+
+	_mm_prefetch_buffer((char *) idwt, subband_width * 4 * sizeof(sint16));
+
+	/* Inverse DWT in horizontal direction, results in 2 sub-bands in L, H order in tmp buffer idwt. */
+	/* The 4 sub-bands are stored in HL(0), LH(1), HH(2), LL(3) order. */
+	/* The lower part L uses LL(3) and HL(0). */
+	/* The higher part H uses LH(1) and HH(2). */
+
+	ll = buffer + subband_width * subband_width * 3;
+	hl = buffer;
+	l_dst = idwt;
+
+	rfx_dwt_2d_decode_block_horiz_SSE2(ll, hl, l_dst, subband_width);
+
+	lh = buffer + subband_width * subband_width;
+	hh = buffer + subband_width * subband_width * 2;
+	h_dst = idwt + subband_width * subband_width * 2;
+	
+	rfx_dwt_2d_decode_block_horiz_SSE2(lh, hh, h_dst, subband_width);
+
+	/* Inverse DWT in vertical direction, results are stored in original buffer. */
+	rfx_dwt_2d_decode_block_vert_SSE2(l_dst, h_dst, buffer, subband_width);
+}
+
+void
+rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer)
+{
+	_mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));
+	
+	rfx_dwt_2d_decode_block_SSE2(buffer + 3840, dwt_buffer, 8);
+	rfx_dwt_2d_decode_block_SSE2(buffer + 3072, dwt_buffer, 16);
+	rfx_dwt_2d_decode_block_SSE2(buffer, dwt_buffer, 32);
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_encode_block_vert_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width)
+{
+	int total_width;
+	int x;
+	int n;
+	__m128i src_2n;
+	__m128i src_2n_1;
+	__m128i src_2n_2;
+	__m128i h_n;
+	__m128i h_n_m;
+	__m128i l_n;
+
+	total_width = subband_width << 1;
+
+	for (n = 0; n < subband_width; n++)
+	{
+		for (x = 0; x < total_width; x += 8)
+		{
+			src_2n = _mm_load_si128((__m128i*) src);
+			src_2n_1 = _mm_load_si128((__m128i*) (src + total_width));
+			if (n < subband_width - 1)
+				src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
+			else
+				src_2n_2 = src_2n_1;
+
+			/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
+
+			h_n = _mm_add_epi16(src_2n, src_2n_2);
+			h_n = _mm_srai_epi16(h_n, 1);
+			h_n = _mm_sub_epi16(src_2n_1, h_n);
+			h_n = _mm_srai_epi16(h_n, 1);
+
+			_mm_store_si128((__m128i*) h, h_n);
+
+			if (n == 0)
+				h_n_m = h_n;
+			else
+				h_n_m = _mm_load_si128((__m128i*) (h - total_width));
+
+			/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
+
+			l_n = _mm_add_epi16(h_n_m, h_n);
+			l_n = _mm_srai_epi16(l_n, 1);
+			l_n = _mm_add_epi16(l_n, src_2n);
+
+			_mm_store_si128((__m128i*) l, l_n);
+
+			src += 8;
+			l += 8;
+			h += 8;
+		}
+		src += total_width;
+	}
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_encode_block_horiz_SSE2(sint16 * src, sint16 * l, sint16 * h, int subband_width)
+{
+	int y;
+	int n;
+	int first;
+	__m128i src_2n;
+	__m128i src_2n_1;
+	__m128i src_2n_2;
+	__m128i h_n;
+	__m128i h_n_m;
+	__m128i l_n;
+
+	for (y = 0; y < subband_width; y++)
+	{
+		for (n = 0; n < subband_width; n += 8)
+		{
+			/* The following 3 Set operations consumes more than half of the total DWT processing time! */
+			src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
+			src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
+			src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[15] : src[16],
+				src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
+
+			/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
+
+			h_n = _mm_add_epi16(src_2n, src_2n_2);
+			h_n = _mm_srai_epi16(h_n, 1);
+			h_n = _mm_sub_epi16(src_2n_1, h_n);
+			h_n = _mm_srai_epi16(h_n, 1);
+
+			_mm_store_si128((__m128i*) h, h_n);
+
+			h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
+			if (n == 0)
+			{
+				first = _mm_extract_epi16(h_n_m, 1);
+				h_n_m = _mm_insert_epi16(h_n_m, first, 0);
+			}
+
+			/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
+
+			l_n = _mm_add_epi16(h_n_m, h_n);
+			l_n = _mm_srai_epi16(l_n, 1);
+			l_n = _mm_add_epi16(l_n, src_2n);
+
+			_mm_store_si128((__m128i*) l, l_n);
+
+			src += 16;
+			l += 8;
+			h += 8;
+		}
+	}
+}
+
+static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+rfx_dwt_2d_encode_block_SSE2(sint16 * buffer, sint16 * dwt, int subband_width)
+{
+	sint16 * hl, * lh, * hh, * ll;
+	sint16 * l_src, * h_src;
+
+	_mm_prefetch_buffer((char *) dwt, subband_width * 4 * sizeof(sint16));
+
+	/* DWT in vertical direction, results in 2 sub-bands in L, H order in tmp buffer dwt. */
+
+	l_src = dwt;
+	h_src = dwt + subband_width * subband_width * 2;
+
+	rfx_dwt_2d_encode_block_vert_SSE2(buffer, l_src, h_src, subband_width);
+
+	/* DWT in horizontal direction, results in 4 sub-bands in HL(0), LH(1), HH(2), LL(3) order, stored in original buffer. */
+	/* The lower part L generates LL(3) and HL(0). */
+	/* The higher part H generates LH(1) and HH(2). */
+
+	ll = buffer + subband_width * subband_width * 3;
+	hl = buffer;
+
+	lh = buffer + subband_width * subband_width;
+	hh = buffer + subband_width * subband_width * 2;
+
+	rfx_dwt_2d_encode_block_horiz_SSE2(l_src, ll, hl, subband_width);
+	rfx_dwt_2d_encode_block_horiz_SSE2(h_src, lh, hh, subband_width);
+}
+
+void
+rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer)
+{
+	_mm_prefetch_buffer((char *) buffer, 4096 * sizeof(sint16));
+	
+	rfx_dwt_2d_encode_block_SSE2(buffer, dwt_buffer, 32);
+	rfx_dwt_2d_encode_block_SSE2(buffer + 3072, dwt_buffer, 16);
+	rfx_dwt_2d_encode_block_SSE2(buffer + 3840, dwt_buffer, 8);
+}
diff --git a/libfreerdp-rfx/sse/rfx_sse2.h b/libfreerdp-rfx/sse/rfx_sse2.h
index 0c16180..85921da 100644
--- a/libfreerdp-rfx/sse/rfx_sse2.h
+++ b/libfreerdp-rfx/sse/rfx_sse2.h
@@ -22,6 +22,11 @@
 
 #include <freerdp/rfx.h>
 
-void rfx_decode_YCbCr_to_RGB_SSE2(uint32 * y_r_buffer, uint32 * cb_g_buffer, uint32 * cr_b_buffer);
+void rfx_decode_YCbCr_to_RGB_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer);
+void rfx_encode_RGB_to_YCbCr_SSE2(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer);
+void rfx_quantization_decode_SSE2(sint16 * buffer, const uint32 * quantization_values);
+void rfx_quantization_encode_SSE2(sint16 * buffer, const uint32 * quantization_values);
+void rfx_dwt_2d_decode_SSE2(sint16 * buffer, sint16 * dwt_buffer);
+void rfx_dwt_2d_encode_SSE2(sint16 * buffer, sint16 * dwt_buffer);
 
 #endif /* __RFX_SSE2_H */
diff --git a/libfreerdp-utils/Makefile.am b/libfreerdp-utils/Makefile.am
index 6db6df1..d6912fd 100644
--- a/libfreerdp-utils/Makefile.am
+++ b/libfreerdp-utils/Makefile.am
@@ -11,7 +11,9 @@ libfreerdp_utils_la_SOURCES = \
 	semaphore.c \
 	unicode.c \
 	wait_obj.c \
-	chan_plugin.c
+	chan_plugin.c \
+	stopwatch.c \
+	profiler.c
 
 libfreerdp_utils_la_CFLAGS = \
 	-I$(top_srcdir) \
diff --git a/libfreerdp-utils/profiler.c b/libfreerdp-utils/profiler.c
new file mode 100644
index 0000000..308c78c
--- /dev/null
+++ b/libfreerdp-utils/profiler.c
@@ -0,0 +1,72 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   Profiler Utils
+
+   Copyright 2011 Stephen Erisman
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#include <freerdp/utils/profiler.h>
+
+PROFILER * profiler_create(char * name)
+{
+	PROFILER * profiler;
+
+	profiler = (PROFILER *) xmalloc(sizeof(PROFILER));
+	
+	profiler->name = name;
+	profiler->stopwatch = stopwatch_create();
+
+	return profiler;
+}
+
+void profiler_free(PROFILER * profiler)
+{	
+	stopwatch_free(profiler->stopwatch);
+	
+	xfree(profiler);
+}
+
+void profiler_enter(PROFILER * profiler)
+{
+	stopwatch_start(profiler->stopwatch);
+}
+
+void profiler_exit(PROFILER * profiler)
+{
+	stopwatch_stop(profiler->stopwatch);
+}
+
+void profiler_print_header()
+{
+	printf("\n");
+	printf("                                             |-----------------------|\n" );
+	printf("                PROFILER                     |    elapsed seconds    |\n" );
+	printf("|--------------------------------------------|-----------------------|\n" );
+	printf("| code section                  | iterations |     total |      avg. |\n" );
+	printf("|-------------------------------|------------|-----------|-----------|\n" );
+}
+
+void profiler_print(PROFILER * profiler)
+{
+	double elapsed_sec = stopwatch_get_elapsed_time_in_seconds(profiler->stopwatch);
+	double avg_sec = elapsed_sec / (double) profiler->stopwatch->count;
+	
+	printf("| %-30.30s| %'10lu | %'9f | %'9f |\n", profiler->name, profiler->stopwatch->count, elapsed_sec, avg_sec);
+}
+
+void profiler_print_footer()
+{
+	printf("|--------------------------------------------------------------------|\n" );
+}
diff --git a/libfreerdp-utils/stopwatch.c b/libfreerdp-utils/stopwatch.c
new file mode 100644
index 0000000..c13365c
--- /dev/null
+++ b/libfreerdp-utils/stopwatch.c
@@ -0,0 +1,60 @@
+/*
+   FreeRDP: A Remote Desktop Protocol client.
+   Stopwatch Utils
+
+   Copyright 2011 Stephen Erisman
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#include <freerdp/utils/stopwatch.h>
+
+STOPWATCH * stopwatch_create()
+{
+	STOPWATCH * sw;
+
+	sw = (STOPWATCH *) xmalloc(sizeof(STOPWATCH));
+	stopwatch_reset(sw);
+
+	return sw;
+}
+
+void stopwatch_free(STOPWATCH * stopwatch)
+{
+	xfree(stopwatch);
+}
+
+void stopwatch_start(STOPWATCH * stopwatch)
+{
+	stopwatch->start = clock();
+	stopwatch->count++;
+}
+
+void stopwatch_stop(STOPWATCH * stopwatch)
+{
+	stopwatch->end = clock();
+	stopwatch->elapsed += (stopwatch->end - stopwatch->start);
+}
+
+void stopwatch_reset(STOPWATCH * stopwatch)
+{
+	stopwatch->start = 0;
+	stopwatch->end = 0;
+	stopwatch->elapsed = 0;
+	stopwatch->count = 0;
+}
+
+double stopwatch_get_elapsed_time_in_seconds(STOPWATCH * stopwatch)
+{
+	return ((double)stopwatch->elapsed) / CLOCKS_PER_SEC;
+}
author	Marc-André Moreau <marcandre.moreau@gmail.com>	2011-06-28 18:19:40 +0400
committer	Marc-André Moreau <marcandre.moreau@gmail.com>	2011-06-28 18:19:40 +0400
commit	46c1d75a72521b548e8f0f677b86f3aab01027e8 (patch)
tree	0a5f57437e5c7cfcfee3db4a71564bc4211a4b06
parent	b2e7d3410410a6e79e95ed6c3ba354fdd4e7ed05 (diff)
parent	c78f823e681a9e53cb8f89d78ab9bdf65405656c (diff)