Progressive RFX for EGFX protocol

Largely a consolidation of the work by jsorg71, with a few minor bug fixes. Verified that RFX Progressive works.
author: Jay Sorg <jay.sorg@gmail.com> 2020-12-07 11:29:46 +0300
committer: Nexarian <cmp@pitstick.net> 2022-05-09 01:54:06 +0300
commit: 34d051363112b16b7df511f08524f69a925939a5 (patch)
tree: e4e9a53f1cee17735cc237101a9bacc74792085a
parent: d8f126abc48a1b949a0be27b334099161c06f0fc (diff)
26 files changed, 1850 insertions, 65 deletions
diff --git a/include/rfxcodec_common.h b/include/rfxcodec_common.h
index 0411c73..616d016 100644
--- a/include/rfxcodec_common.h
+++ b/include/rfxcodec_common.h
@@ -31,6 +31,7 @@
 #define RFX_FLAGS_OPT1    (1 << 3)
 #define RFX_FLAGS_OPT2    (1 << 4)
 #define RFX_FLAGS_NOACCEL (1 << 6)
+#define RFX_FLAGS_PRO1    (1 << 7)
 
 #define RFX_FLAGS_RLGR3 0 /* default */
 #define RFX_FLAGS_RLGR1 1
diff --git a/include/rfxcodec_encode.h b/include/rfxcodec_encode.h
index 2c5876f..76e5e6a 100644
--- a/include/rfxcodec_encode.h
+++ b/include/rfxcodec_encode.h
@@ -80,8 +80,8 @@ typedef int (*rfxencode_differential_proc)(short *buffer, int buffer_size);
 typedef int (*rfxencode_quantization_proc)(short *buffer, const char *quantization_values);
 typedef int (*rfxencode_dwt_2d_proc)(const unsigned char *in_buffer, short *buffer, short *dwt_buffer);
 
-typedef int (*rfxencode_diff_rlgr1_proc)(short *coef, unsigned char *cdata, int cdata_size);
-typedef int (*rfxencode_diff_rlgr3_proc)(short *coef, unsigned char *cdata, int cdata_size);
+typedef int (*rfxencode_diff_rlgr1_proc)(short *coef, unsigned char *cdata, int cdata_size, int diff_bytes);
+typedef int (*rfxencode_diff_rlgr3_proc)(short *coef, unsigned char *cdata, int cdata_size, int diff_bytes);
 
 typedef int (*rfxencode_dwt_shift_x86_sse2_proc)(const char *qtable, const unsigned char *data, short *dwt_buffer1, short *dwt_buffer);
 typedef int (*rfxencode_dwt_shift_x86_sse41_proc)(const char *qtable, const unsigned char *data, short *dwt_buffer1, short *dwt_buffer);
diff --git a/src/Makefile.am b/src/Makefile.am
index 5b7983d..83b5171 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -32,7 +32,9 @@ noinst_HEADERS = \
   rfxencode_tile.h \
   rfxencode_diff_rlgr1.h \
   rfxencode_diff_rlgr3.h \
-  rfxencode_rgb_to_yuv.h
+  rfxencode_rgb_to_yuv.h \
+  rfxencode_dwt_rem.h \
+  rfxencode_dwt_shift_rem.h
 
 lib_LTLIBRARIES = librfxencode.la
 
@@ -41,4 +43,6 @@ librfxencode_la_SOURCES = $(noinst_HEADERS) rfxencode.c \
   rfxencode_quantization.c rfxencode_differential.c \
   rfxencode_rlgr1.c rfxencode_rlgr3.c rfxencode_alpha.c \
   rfxencode_diff_rlgr1.c rfxencode_diff_rlgr3.c \
-  rfxencode_rgb_to_yuv.c
+  rfxencode_rgb_to_yuv.c \
+  rfxencode_dwt_rem.c \
+  rfxencode_dwt_shift_rem.c
diff --git a/src/amd64/rfxencode_tile_amd64.c b/src/amd64/rfxencode_tile_amd64.c
index 1619752..2ce5037 100644
--- a/src/amd64/rfxencode_tile_amd64.c
+++ b/src/amd64/rfxencode_tile_amd64.c
@@ -53,7 +53,7 @@ rfx_encode_component_rlgr1_amd64_sse2(struct rfxencode *enc, const char *qtable,
     {
         return 1;
     }
-    *size = rfx_encode_diff_rlgr1(enc->dwt_buffer1, buffer, buffer_size);
+    *size = rfx_encode_diff_rlgr1(enc->dwt_buffer1, buffer, buffer_size, 64);
     return 0;
 }
 
@@ -69,7 +69,7 @@ rfx_encode_component_rlgr3_amd64_sse2(struct rfxencode *enc, const char *qtable,
     {
         return 1;
     }
-    *size = rfx_encode_diff_rlgr3(enc->dwt_buffer1, buffer, buffer_size);
+    *size = rfx_encode_diff_rlgr3(enc->dwt_buffer1, buffer, buffer_size, 64);
     return 0;
 }
 
@@ -85,7 +85,7 @@ rfx_encode_component_rlgr1_amd64_sse41(struct rfxencode *enc, const char *qtable
     {
         return 1;
     }
-    *size = rfx_encode_diff_rlgr1(enc->dwt_buffer1, buffer, buffer_size);
+    *size = rfx_encode_diff_rlgr1(enc->dwt_buffer1, buffer, buffer_size, 64);
     return 0;
 }
 
@@ -101,6 +101,6 @@ rfx_encode_component_rlgr3_amd64_sse41(struct rfxencode *enc, const char *qtable
     {
         return 1;
     }
-    *size = rfx_encode_diff_rlgr3(enc->dwt_buffer1, buffer, buffer_size);
+    *size = rfx_encode_diff_rlgr3(enc->dwt_buffer1, buffer, buffer_size, 64);
     return 0;
 }
diff --git a/src/rfxcommon.h b/src/rfxcommon.h
index dad376c..2d208af 100644
--- a/src/rfxcommon.h
+++ b/src/rfxcommon.h
@@ -84,6 +84,9 @@ typedef struct _STREAM STREAM;
 } while (0)
 #endif
 
+#define stream_read(_s, _b, _n) do { memcpy(_b, (_s)->p, _n); (_s)->p += _n; } while (0) /* copy _n bytes from the cursor into _b, then advance; _n is expanded twice */
+#define stream_write(_s, _b, _n) do { memcpy((_s)->p, _b, _n); (_s)->p += _n; } while (0) /* copy _n bytes from _b to the cursor, then advance; no bounds check is done here */
+
 #define stream_seek(_s, _n) (_s)->p += _n
 #define stream_seek_uint8(_s) (_s)->p += 1
 #define stream_seek_uint16(_s) (_s)->p += 2
diff --git a/src/rfxconstants.h b/src/rfxconstants.h
index 770fccb..77f487c 100644
--- a/src/rfxconstants.h
+++ b/src/rfxconstants.h
@@ -44,6 +44,22 @@ enum _RLGR_MODE
 #define CBT_TILESET             0xCAC2
 #define CBT_TILE                0xCAC3
 
+/* progressive blockType */
+#define PRO_WBT_SYNC                        0xCCC0
+#define PRO_WBT_FRAME_BEGIN                 0xCCC1
+#define PRO_WBT_FRAME_END                   0xCCC2
+#define PRO_WBT_CONTEXT                     0xCCC3
+#define PRO_WBT_REGION                      0xCCC4
+#define PRO_WBT_TILE_SIMPLE                 0xCCC5
+#define PRO_WBT_TILE_PROGRESSIVE_FIRST      0xCCC6
+#define PRO_WBT_TILE_PROGRESSIVE_UPGRADE    0xCCC7
+
+#define RFX_SUBBAND_DIFFING     0x01
+
+#define RFX_DWT_REDUCE_EXTRAPOLATE      0x01
+
+#define RFX_TILE_DIFFERENCE     0x01
+
 /* tileSize */
 #define CT_TILE_64x64           0x0040
 
diff --git a/src/rfxencode.c b/src/rfxencode.c
index 7822f14..9b7826a 100644
--- a/src/rfxencode.c
+++ b/src/rfxencode.c
@@ -68,6 +68,10 @@ rfxcodec_encode_create_ex(int width, int height, int format, int flags,
     enc->dwt_buffer = (sint16 *) (((size_t) (enc->dwt_buffer_a)) & ~15);
     enc->dwt_buffer1 = (sint16 *) (((size_t) (enc->dwt_buffer1_a)) & ~15);
     enc->dwt_buffer2 = (sint16 *) (((size_t) (enc->dwt_buffer2_a)) & ~15);
+    enc->dwt_buffer3 = (sint16 *) (((size_t) (enc->dwt_buffer3_a)) & ~15);
+    enc->dwt_buffer4 = (sint16 *) (((size_t) (enc->dwt_buffer4_a)) & ~15);
+    enc->dwt_buffer5 = (sint16 *) (((size_t) (enc->dwt_buffer5_a)) & ~15);
+    enc->dwt_buffer6 = (sint16 *) (((size_t) (enc->dwt_buffer6_a)) & ~15);
 
 #if defined(RFX_USE_ACCEL_X86)
     cpuid_x86(1, 0, &ax, &bx, &cx, &dx);
@@ -157,7 +161,11 @@ rfxcodec_encode_create_ex(int width, int height, int format, int flags,
     enc->rfx_encode_rgb_to_yuv = rfx_encode_rgb_to_yuv;
     enc->rfx_encode_argb_to_yuva = rfx_encode_argb_to_yuva;
     /* assign encoding functions */
-    if (flags & RFX_FLAGS_NOACCEL)
+    if (flags & RFX_FLAGS_PRO1)
+    {
+        enc->pro_ver = 1;
+    }
+    else if (flags & RFX_FLAGS_NOACCEL)
     {
         if (enc->mode == RLGR3)
         {
@@ -295,12 +303,21 @@ int
 rfxcodec_encode_destroy(void *handle)
 {
     struct rfxencode *enc;
+    int index; /* first subscript of enc->rbs */
+    int jndex; /* second subscript of enc->rbs */
 
     enc = (struct rfxencode *) handle;
     if (enc == NULL)
     {
         return 0;
     }
+    for (index = 0; index < RFX_MAX_RB_X; index++) /* same bounds as the rbs[][] declaration */
+    {
+        for (jndex = 0; jndex < RFX_MAX_RB_Y; jndex++)
+        {
+            free(enc->rbs[index][jndex]); /* per-tile reference buffer; presumably NULL if never allocated - confirm enc is zero-filled */
+        }
+    }
     free(enc);
     return 0;
 }
@@ -323,6 +340,28 @@ rfxcodec_encode_ex(void *handle, char *cdata, int *cdata_bytes,
     s.p = s.data;
     s.size = *cdata_bytes;
 
+    if (enc->pro_ver > 0)
+    {
+        /* Only the first frame should send the RemoteFX header */
+        if ((enc->frame_idx == 0) && (enc->header_processed == 0))
+        {
+            if (rfx_pro_compose_message_header(enc, &s) != 0)
+            {
+                return -1;
+            }
+        }
+        tiles_written = rfx_pro_compose_message_data(enc, &s, regions, num_regions,
+                                         buf, width, height, stride_bytes,
+                                         tiles, num_tiles, quants, num_quants,
+                                         flags);
+        if (tiles_written <= 0)
+        {
+            return -1;
+        }
+        *cdata_bytes = (int) (s.p - s.data);
+        return tiles_written;
+    }
+
     /* Only the first frame should send the RemoteFX header */
     if ((enc->frame_idx == 0) && (enc->header_processed == 0))
     {
@@ -374,3 +413,48 @@ rfxcodec_encode_get_internals(struct rfxcodec_encode_internals *internals)
 #endif
     return 0;
 }
+
+/*****************************************************************************/
+/* produce a hex dump of len bytes starting at p, 16 bytes per row, on stdout */
+void
+rfxcodec_hexdump(const void *p, int len)
+{
+    const unsigned char *line;
+    int i;
+    int thisline;
+    int offset;
+
+    line = (const unsigned char *) p;
+    offset = 0;
+
+    while (offset < len)
+    {
+        printf("%04x ", offset); /* row prefix: offset of the first byte */
+        thisline = len - offset;
+
+        if (thisline > 16)
+        {
+            thisline = 16;
+        }
+
+        for (i = 0; i < thisline; i++)
+        {
+            printf("%02x ", line[i]);
+        }
+
+        for (; i < 16; i++) /* pad a short last row so the text column lines up */
+        {
+            printf("   ");
+        }
+
+        for (i = 0; i < thisline; i++)
+        {
+            printf("%c", (line[i] >= 0x20 && line[i] < 0x7f) ? line[i] : '.');
+        }
+
+        printf("%s", "\n");
+        offset += thisline;
+        line += thisline;
+    }
+}
+
diff --git a/src/rfxencode.h b/src/rfxencode.h
index 8e185dc..28b1479 100644
--- a/src/rfxencode.h
+++ b/src/rfxencode.h
@@ -33,6 +33,17 @@ typedef int (*rfx_encode_proc)(struct rfxencode *enc, const char *qtable,
                                const uint8 *data,
                                uint8 *buffer, int buffer_size, int *size);
 
+struct rfx_rb /* per-tile reference coefficients that the progressive encoder diffs against */
+{
+    sint16 y[4096]; /* last Y coefficients stored by COEF_DIFF_COUNT_COPY */
+    sint16 u[4096]; /* last Cb coefficients */
+    sint16 v[4096]; /* last Cr coefficients */
+};
+
+
+#define RFX_MAX_RB_X 64
+#define RFX_MAX_RB_Y 64
+
 struct rfxencode
 {
     int width;
@@ -44,7 +55,8 @@ struct rfxencode
     int flags;
     int bits_per_pixel;
     int format;
-    int pad0[7];
+    int pro_ver;
+    int pad0[6];
 
     uint8 a_buffer[4096];
     uint8 y_r_buffer[4096];
@@ -54,13 +66,24 @@ struct rfxencode
     sint16 dwt_buffer_a[4096];
     sint16 dwt_buffer1_a[4096];
     sint16 dwt_buffer2_a[4096];
+    sint16 dwt_buffer3_a[4096];
+    sint16 dwt_buffer4_a[4096];
+    sint16 dwt_buffer5_a[4096];
+    sint16 dwt_buffer6_a[4096];
     uint8 pad2[16];
     sint16 *dwt_buffer;
     sint16 *dwt_buffer1;
     sint16 *dwt_buffer2;
+    sint16 *dwt_buffer3;
+    sint16 *dwt_buffer4;
+    sint16 *dwt_buffer5;
+    sint16 *dwt_buffer6;
     rfx_encode_proc rfx_encode;
     rfx_encode_rgb_to_yuv_proc rfx_encode_rgb_to_yuv;
     rfx_encode_argb_to_yuva_proc rfx_encode_argb_to_yuva;
+    rfx_encode_proc rfx_rem_encode;
+
+    struct rfx_rb * rbs[RFX_MAX_RB_X][RFX_MAX_RB_Y];
 
     int got_sse2;
     int got_sse3;
@@ -72,4 +95,7 @@ struct rfxencode
     int got_neon;
 };
 
+void
+rfxcodec_hexdump(const void *p, int len);
+
 #endif
diff --git a/src/rfxencode_compose.c b/src/rfxencode_compose.c
index 0279c90..b07f0c0 100644
--- a/src/rfxencode_compose.c
+++ b/src/rfxencode_compose.c
@@ -33,6 +33,13 @@
 #include "rfxconstants.h"
 #include "rfxencode_tile.h"
 
+#include "rfxencode_quantization.h"
+#include "rfxencode_dwt_rem.h"
+#include "rfxencode_dwt_shift_rem.h"
+#include "rfxencode_diff_rlgr1.h"
+#include "rfxencode_rlgr1.h"
+#include "rfxencode_differential.h"
+
 #define LLOG_LEVEL 1
 #define LLOGLN(_level, _args) \
     do { if (_level < LLOG_LEVEL) { printf _args ; printf("\n"); } } while (0)
@@ -238,6 +245,8 @@ rfx_compose_message_tile_yuv(struct rfxencode *enc, STREAM *s,
     {
         return 1;
     }
+    LLOGLN(10, ("rfx_compose_message_tile_yuv: YLen %d CbLen %d CrLen %d",
+           YLen, CbLen, CrLen));
     end_pos = stream_get_pos(s);
     stream_set_pos(s, start_pos + 2);
     stream_write_uint32(s, 19 + YLen + CbLen + CrLen); /* BlockT.blockLen */
@@ -610,3 +619,346 @@ rfx_compose_message_data(struct rfxencode *enc, STREAM *s,
     }
     return tiles_written;
 }
+
+/******************************************************************************/
+static int
+rfx_pro_compose_message_context(struct rfxencode *enc, STREAM *s)
+{
+    if (stream_get_left(s) < 10) /* PRO_WBT_CONTEXT is a fixed 10-byte block */
+    {
+        return 1; /* no room: nothing is written */
+    }
+    stream_write_uint16(s, PRO_WBT_CONTEXT); /* blockType */
+    stream_write_uint32(s, 10); /* blockLen: 2 + 4 + 1 + 2 + 1 */
+    stream_write_uint8(s, 0); /* ctxId */
+    stream_write_uint16(s, CT_TILE_64x64); /* tileSize */
+    stream_write_uint8(s, RFX_SUBBAND_DIFFING); /* flags */
+    return 0; /* block written */
+}
+
+/******************************************************************************/
+int
+rfx_pro_compose_message_header(struct rfxencode *enc, STREAM *s)
+{
+    if (rfx_compose_message_sync(enc, s) != 0) /* sync block goes first */
+    {
+        return 1;
+    }
+    if (rfx_pro_compose_message_context(enc, s) != 0) /* then the progressive context block */
+    {
+        return 1;
+    }
+    enc->header_processed = 1; /* rfxcodec_encode_ex tests this before sending the header */
+    return 0; /* 0 = both blocks written, 1 = a sub-step failed */
+}
+
+/******************************************************************************/
+static int
+rfx_pro_compose_message_frame_begin(struct rfxencode *enc, STREAM *s)
+{
+    if (stream_get_left(s) < 12) /* PRO_WBT_FRAME_BEGIN is a fixed 12-byte block */
+    {
+        return 1; /* no room: nothing written, frame_idx unchanged */
+    }
+    stream_write_uint16(s, PRO_WBT_FRAME_BEGIN); /* blockType */
+    stream_write_uint32(s, 12); /* blockLen: 2 + 4 + 4 + 2 */
+    stream_write_uint32(s, enc->frame_idx); /* frameIndex */
+    stream_write_uint16(s, 1); /* regionCount: the caller emits one region per frame */
+    enc->frame_idx++; /* the next frame gets the next index */
+    return 0;
+}
+
+/******************************************************************************/
+/* coef1 = coef2 - coef3 (QCdt = QCot - QCrb), then coef3 = coef2, all 4096;
+   _count1 / _count2 = zeros in coef1 / coef2 over the first 4096 - 81
+   entries only (the last 81 are differenced and copied, not counted) */
+#define COEF_DIFF_COUNT_COPY(_coef1, _coef2, _coef3, _loop, _count1, _count2) \
+do { _count1 = 0; _count2 = 0; \
+    for (_loop = 0; _loop < 4096 - 81; _loop++) { \
+        _coef1[_loop] = _coef2[_loop] - _coef3[_loop]; \
+        if (_coef1[_loop] == 0) { _count1++; } \
+        if (_coef2[_loop] == 0) { _count2++; } \
+        _coef3[_loop] = _coef2[_loop]; } \
+    while (_loop < 4096) { \
+        _coef1[_loop] = _coef2[_loop] - _coef3[_loop]; \
+        _coef3[_loop] = _coef2[_loop]; _loop++; } \
+} while (0)
+
+/******************************************************************************/
+/* coef1 = coef2 - coef3 (QCdt = QCot - QCrb) for all 4096, coef3 untouched;
+   zeros in coef1, coef2 are counted over the first 4096 - 81 entries only */
+#define COEF_DIFF_COUNT(_coef1, _coef2, _coef3, _loop, _count1, _count2) \
+do { _count1 = 0; _count2 = 0; \
+    for (_loop = 0; _loop < 4096 - 81; _loop++) { \
+        _coef1[_loop] = _coef2[_loop] - _coef3[_loop]; \
+        if (_coef1[_loop] == 0) { _count1++; } \
+        if (_coef2[_loop] == 0) { _count2++; } } \
+    while (_loop < 4096) { \
+        _coef1[_loop] = _coef2[_loop] - _coef3[_loop]; _loop++; } \
+} while (0)
+
+/******************************************************************************/
+/* coef1 = coef2 - coef3 (QCdt = QCot - QCrb) over all 4096 entries,
+   then coef3 = coef2; no zero counting */
+#define COEF_DIFF_COPY(_coef1, _coef2, _coef3, _loop) \
+do { \
+    for (_loop = 0; _loop < 4096; _loop++) { \
+        _coef1[_loop] = _coef2[_loop] - _coef3[_loop]; \
+        _coef3[_loop] = _coef2[_loop]; } \
+} while (0)
+
+/******************************************************************************/
+/* coef1 = coef2 - coef3 (QCdt = QCot - QCrb) over all 4096; coef3 untouched */
+#define COEF_DIFF(_coef1, _coef2, _coef3, _loop) \
+do { \
+    for (_loop = 0; _loop < 4096; _loop++) { \
+        _coef1[_loop] = _coef2[_loop] - _coef3[_loop]; } \
+} while (0)
+
+/******************************************************************************/
+/* Write one PRO_WBT_REGION block: header, rects, quant table, then one
+   PRO_WBT_TILE_SIMPLE per tile, sent as-is or as a difference against
+   enc->rbs. Returns the number of tiles written, or -1 on any failure. */
+static int
+rfx_pro_compose_message_region(struct rfxencode *enc, STREAM *s,
+                               const struct rfx_rect *regions, int num_regions,
+                               const char *buf, int width, int height,
+                               int stride_bytes,
+                               const struct rfx_tile *tiles, int num_tiles,
+                               const char *quants, int num_quants,
+                               int flags)
+{
+    int index;
+    int jndex;
+    int start_pos;
+    int tiles_start_pos;
+    int end_pos;
+    int tiles_written;
+    int x;
+    int y;
+    uint8 quantIdxY;
+    uint8 quantIdxCb;
+    uint8 quantIdxCr;
+    const char *tile_data;
+    int y_bytes;
+    int u_bytes;
+    int v_bytes;
+    int tile_start_pos;
+    int tile_end_pos;
+    uint16 xIdx;
+    uint16 yIdx;
+    const uint8 *y_buffer;
+    const uint8 *u_buffer;
+    const uint8 *v_buffer;
+    const char *y_quants;
+    const char *u_quants;
+    const char *v_quants;
+    struct rfx_rb *rb;
+    int dt_y_zeros;
+    int dt_u_zeros;
+    int dt_v_zeros;
+    int ot_y_zeros;
+    int ot_u_zeros;
+    int ot_v_zeros;
+    int tile_flags;
+    sint16 *dwt_buffer_y;
+    sint16 *dwt_buffer_u;
+    sint16 *dwt_buffer_v;
+
+    if (quants == NULL) /* fall back to the built-in table before sizing */
+    {
+        num_quants = 1;
+        quants = (const char *) g_rfx_default_quantization_values;
+    }
+    if (stream_get_left(s) < 18 + num_regions * 8 + num_quants * 5)
+    {
+        return -1; /* fixed header + rects + quants do not fit */
+    }
+    start_pos = stream_get_pos(s);
+    stream_write_uint16(s, PRO_WBT_REGION);
+    stream_seek_uint32(s); /* blockLen, set later */
+    stream_write_uint8(s, CT_TILE_64x64);
+    stream_write_uint16(s, num_regions);
+    stream_write_uint8(s, num_quants);
+    stream_write_uint8(s, 0); /* numProgQuant */
+    stream_write_uint8(s, RFX_DWT_REDUCE_EXTRAPOLATE); /* flags */
+    stream_write_uint16(s, num_tiles);
+    stream_seek_uint32(s); /* tileDataSize, set later */
+    for (index = 0; index < num_regions; index++)
+    {
+        stream_write_uint16(s, regions[index].x);
+        stream_write_uint16(s, regions[index].y);
+        stream_write_uint16(s, regions[index].cx);
+        stream_write_uint16(s, regions[index].cy);
+    }
+    stream_write(s, quants, num_quants * 5); /* 5 bytes per quant entry */
+    tiles_start_pos = stream_get_pos(s);
+    tiles_written = 0;
+    for (index = 0; index < num_tiles; index++)
+    {
+        if (stream_get_left(s) < 22) /* fixed TILE_SIMPLE header size */
+        {
+            return -1;
+        }
+        x = tiles[index].x;
+        y = tiles[index].y;
+        quantIdxY = tiles[index].quant_y;
+        quantIdxCb = tiles[index].quant_cb;
+        quantIdxCr = tiles[index].quant_cr;
+        if ((quantIdxY >= num_quants) || (quantIdxCb >= num_quants) ||
+            (quantIdxCr >= num_quants))
+        {
+            return -1; /* tile refers to a quant entry that was not sent */
+        }
+        tile_data = buf + (y << 8) * (stride_bytes >> 8) + (x << 8);
+        xIdx = x / 64;
+        yIdx = y / 64;
+        if ((xIdx >= RFX_MAX_RB_X) || (yIdx >= RFX_MAX_RB_Y))
+        {
+            return -1; /* outside the enc->rbs[][] reference grid */
+        }
+        tile_start_pos = stream_get_pos(s);
+        stream_write_uint16(s, PRO_WBT_TILE_SIMPLE);
+        stream_seek_uint32(s); /* set later */
+        stream_write_uint8(s, quantIdxY);
+        stream_write_uint8(s, quantIdxCb);
+        stream_write_uint8(s, quantIdxCr);
+        stream_write_uint16(s, xIdx);
+        stream_write_uint16(s, yIdx);
+        stream_seek(s, 1); /* flags, set later */
+        stream_seek(s, 8); /* yLen, cbLen, crLen, tailLen, set later */
+        y_buffer = (const uint8 *) tile_data;
+        u_buffer = (const uint8 *) (tile_data + RFX_YUV_BTES);
+        v_buffer = (const uint8 *) (tile_data + RFX_YUV_BTES * 2);
+        y_quants = quants + quantIdxY * 5;
+        u_quants = quants + quantIdxCb * 5;
+        v_quants = quants + quantIdxCr * 5;
+        rb = enc->rbs[xIdx][yIdx];
+        if (rb == NULL) /* first time this tile position is encoded */
+        {
+            rb = xnew(struct rfx_rb);
+            if (rb == NULL)
+            {
+                return -1;
+            }
+            enc->rbs[xIdx][yIdx] = rb;
+        }
+        rfx_rem_dwt_shift_encode(y_buffer, enc->dwt_buffer1,
+                                 enc->dwt_buffer, y_quants);
+        rfx_rem_dwt_shift_encode(u_buffer, enc->dwt_buffer2,
+                                 enc->dwt_buffer, u_quants);
+        rfx_rem_dwt_shift_encode(v_buffer, enc->dwt_buffer3,
+                                 enc->dwt_buffer, v_quants);
+        COEF_DIFF_COUNT_COPY(enc->dwt_buffer4, enc->dwt_buffer1, rb->y,
+                             jndex, dt_y_zeros, ot_y_zeros);
+        COEF_DIFF_COUNT_COPY(enc->dwt_buffer5, enc->dwt_buffer2, rb->u,
+                             jndex, dt_u_zeros, ot_u_zeros);
+        COEF_DIFF_COUNT_COPY(enc->dwt_buffer6, enc->dwt_buffer3, rb->v,
+                             jndex, dt_v_zeros, ot_v_zeros);
+        if (ot_y_zeros + ot_u_zeros + ot_v_zeros <
+            dt_y_zeros + dt_u_zeros + dt_v_zeros)
+        {
+            LLOGLN(10, ("rfx_pro_compose_message_region: diff"));
+            tile_flags = RFX_TILE_DIFFERENCE; /* difference has more zeros */
+            dwt_buffer_y = enc->dwt_buffer4;
+            dwt_buffer_u = enc->dwt_buffer5;
+            dwt_buffer_v = enc->dwt_buffer6;
+        }
+        else
+        {
+            LLOGLN(10, ("rfx_pro_compose_message_region: orig"));
+            tile_flags = 0; /* send the coefficients themselves */
+            dwt_buffer_y = enc->dwt_buffer1;
+            dwt_buffer_u = enc->dwt_buffer2;
+            dwt_buffer_v = enc->dwt_buffer3;
+        }
+        y_bytes = rfx_encode_diff_rlgr1(dwt_buffer_y,
+                                        stream_get_tail(s),
+                                        stream_get_left(s), 81);
+        if (y_bytes < 0) /* encoder ran out of output space */
+        {
+            return -1;
+        }
+        stream_seek(s, y_bytes);
+        u_bytes = rfx_encode_diff_rlgr1(dwt_buffer_u,
+                                        stream_get_tail(s),
+                                        stream_get_left(s), 81);
+        if (u_bytes < 0)
+        {
+            return -1;
+        }
+        stream_seek(s, u_bytes);
+        v_bytes = rfx_encode_diff_rlgr1(dwt_buffer_v,
+                                        stream_get_tail(s),
+                                        stream_get_left(s), 81);
+        if (v_bytes < 0)
+        {
+            return -1;
+        }
+        stream_seek(s, v_bytes);
+        LLOGLN(10, ("rfx_pro_compose_message_region: y_bytes %d "
+               "u_bytes %d v_bytes %d", y_bytes, u_bytes, v_bytes));
+        tile_end_pos = stream_get_pos(s);
+        stream_set_pos(s, tile_start_pos + 2); /* back-patch the tile header */
+        stream_write_uint32(s, tile_end_pos - tile_start_pos); /* blockLen */
+        stream_set_pos(s, tile_start_pos + 13);
+        stream_write_uint8(s, tile_flags); /* flags */
+        stream_write_uint16(s, y_bytes); /* yLen */
+        stream_write_uint16(s, u_bytes); /* cbLen */
+        stream_write_uint16(s, v_bytes); /* crLen */
+        stream_write_uint16(s, 0); /* tailLen */
+        stream_set_pos(s, tile_end_pos);
+        ++tiles_written;
+    }
+    end_pos = stream_get_pos(s);
+    stream_set_pos(s, start_pos + 2); /* back-patch the region header */
+    stream_write_uint32(s, end_pos - start_pos); /* blockLen */
+    stream_set_pos(s, start_pos + 14);
+    stream_write_uint32(s, end_pos - tiles_start_pos); /* tileDataSize */
+    stream_set_pos(s, end_pos);
+    return tiles_written;
+}
+
+/******************************************************************************/
+static int
+rfx_pro_compose_message_frame_end(struct rfxencode *enc, STREAM *s)
+{
+    if (stream_get_left(s) < 6) /* PRO_WBT_FRAME_END is a fixed 6-byte block */
+    {
+        return 1; /* no room: nothing is written */
+    }
+    stream_write_uint16(s, PRO_WBT_FRAME_END); /* blockType */
+    stream_write_uint32(s, 6); /* blockLen: 2 + 4, no payload */
+    return 0;
+}
+
+/******************************************************************************/
+int
+rfx_pro_compose_message_data(struct rfxencode *enc, STREAM *s,
+                             const struct rfx_rect *regions, int num_regions,
+                             const char *buf, int width, int height,
+                             int stride_bytes,
+                             const struct rfx_tile *tiles, int num_tiles,
+                             const char *quants, int num_quants,
+                             int flags)
+{
+    int tiles_written; /* result of the region step, returned on success */
+    LLOGLN(10, ("rfx_pro_compose_message_data:"));
+    if (rfx_pro_compose_message_frame_begin(enc, s) != 0) /* 1: frame begin */
+    {
+        return -1;
+    }
+    tiles_written = rfx_pro_compose_message_region(enc, s, regions, num_regions,
+                                   buf, width, height, stride_bytes,
+                                   tiles, num_tiles, quants, num_quants,
+                                   flags); /* 2: the single region block with all tiles */
+    if (tiles_written <= 0) /* zero tiles is also reported as failure */
+    {
+        return -1;
+    }
+    if (rfx_pro_compose_message_frame_end(enc, s) != 0) /* 3: frame end */
+    {
+        return -1;
+    }
+    return tiles_written; /* > 0 here; every failure above returns -1 */
+}
diff --git a/src/rfxencode_compose.h b/src/rfxencode_compose.h
index 6fde0f3..1bafb22 100644
--- a/src/rfxencode_compose.h
+++ b/src/rfxencode_compose.h
@@ -31,4 +31,14 @@ rfx_compose_message_data(struct rfxencode *enc, STREAM *s,
                          const struct rfx_tile *tiles, int num_tiles,
                          const char *quants, int num_quants, int flags);
 
+int
+rfx_pro_compose_message_header(struct rfxencode *enc, STREAM *s);
+int
+rfx_pro_compose_message_data(struct rfxencode *enc, STREAM *s,
+                             const struct rfx_rect *regions, int num_regions,
+                             const char *buf, int width, int height,
+                             int stride_bytes,
+                             const struct rfx_tile *tiles, int num_tiles,
+                             const char *quants, int num_quants, int flags);
+
 #endif
diff --git a/src/rfxencode_diff_rlgr1.c b/src/rfxencode_diff_rlgr1.c
index cd09f18..345108f 100644
--- a/src/rfxencode_diff_rlgr1.c
+++ b/src/rfxencode_diff_rlgr1.c
@@ -52,6 +52,11 @@
 #define CheckWrite do { \
     while (bit_count >= 8) \
     { \
+        if (cdata_size < 1) \
+        { \
+            return -1; \
+        } \
+        cdata_size--; \
         bit_count -= 8; \
         *cdata = bits >> bit_count; \
         cdata++; \
@@ -97,7 +102,8 @@
 } while (0)
 
 int
-rfx_encode_diff_rlgr1(sint16 *coef, uint8 *cdata, int cdata_size)
+rfx_encode_diff_rlgr1(sint16 *coef, uint8 *cdata, int cdata_size,
+                      int diff_bytes)
 {
     int k;
     int kp;
@@ -119,8 +125,8 @@ rfx_encode_diff_rlgr1(sint16 *coef, uint8 *cdata, int cdata_size)
 
     uint32 twoMs;
 
-    /* the last 64 bytes are diff */
-    for (k = PIXELS_IN_TILE - 1; k > PIXELS_IN_TILE - 64; k--)
+    /* the last x bytes are diff */
+    for (k = PIXELS_IN_TILE - 1; k > PIXELS_IN_TILE - diff_bytes; k--)
     {
         coef[k] -= coef[k - 1];
     }
@@ -147,9 +153,13 @@ rfx_encode_diff_rlgr1(sint16 *coef, uint8 *cdata, int cdata_size)
             numZeros = 0;
 
             GetNextInput;
-            while (input == 0 && coef_size > 0)
+            while (input == 0)
             {
                 numZeros++;
+                if (coef_size < 1)
+                {
+                    break;
+                }
                 GetNextInput;
             }
 
@@ -183,6 +193,11 @@ rfx_encode_diff_rlgr1(sint16 *coef, uint8 *cdata, int cdata_size)
 
             CheckWrite;
 
+            if (input == 0)
+            {
+                continue;
+            }
+
             /* encode the nonzero value using GR coding */
             if (input < 0)
             {
@@ -199,7 +214,7 @@ rfx_encode_diff_rlgr1(sint16 *coef, uint8 *cdata, int cdata_size)
             bits |= sign;
             bit_count++;
 
-            lmag = mag ? mag - 1 : 0;
+            lmag = mag - 1;
 
             CodeGR(krp, lmag); /* output GR code for (mag - 1) */
             CheckWrite;
@@ -239,6 +254,10 @@ rfx_encode_diff_rlgr1(sint16 *coef, uint8 *cdata, int cdata_size)
 
     if (bit_count > 0)
     {
+        if (cdata_size < 1)
+        {
+            return -1;
+        }
         bits <<= 8 - bit_count;
         *cdata = bits;
         cdata++;
@@ -249,4 +268,3 @@ rfx_encode_diff_rlgr1(sint16 *coef, uint8 *cdata, int cdata_size)
 
     return processed_size;
 }
-
diff --git a/src/rfxencode_diff_rlgr1.h b/src/rfxencode_diff_rlgr1.h
index 796f8f2..b123766 100644
--- a/src/rfxencode_diff_rlgr1.h
+++ b/src/rfxencode_diff_rlgr1.h
@@ -23,7 +23,8 @@
 #include "rfxcommon.h"
 
 int
-rfx_encode_diff_rlgr1(sint16 *coef, uint8 *cdata, int cdata_size);
+rfx_encode_diff_rlgr1(sint16 *coef, uint8 *cdata, int cdata_size,
+                      int diff_bytes);
 
 #endif /* __RFX_DIFF_RLGR1_H */
 
diff --git a/src/rfxencode_diff_rlgr3.c b/src/rfxencode_diff_rlgr3.c
index 0b68db3..24bb2e5 100644
--- a/src/rfxencode_diff_rlgr3.c
+++ b/src/rfxencode_diff_rlgr3.c
@@ -52,6 +52,11 @@
 #define CheckWrite do { \
     while (bit_count >= 8) \
     { \
+        if (cdata_size < 1) \
+        { \
+            return -1; \
+        } \
+        cdata_size--; \
         bit_count -= 8; \
         *cdata = bits >> bit_count; \
         cdata++; \
@@ -97,7 +102,8 @@
 } while (0)
 
 int
-rfx_encode_diff_rlgr3(sint16 *coef, uint8 *cdata, int cdata_size)
+rfx_encode_diff_rlgr3(sint16 *coef, uint8 *cdata, int cdata_size,
+                      int diff_bytes)
 {
     int k;
     int kp;
@@ -122,8 +128,8 @@ rfx_encode_diff_rlgr3(sint16 *coef, uint8 *cdata, int cdata_size)
     uint32 sum2Ms;
     uint32 nIdx;
 
-    /* the last 64 bytes are diff */
-    for (k = PIXELS_IN_TILE - 1; k > PIXELS_IN_TILE - 64; k--)
+    /* the last x bytes are diff */
+    for (k = PIXELS_IN_TILE - 1; k > PIXELS_IN_TILE - diff_bytes; k--)
     {
         coef[k] -= coef[k - 1];
     }
@@ -150,9 +156,13 @@ rfx_encode_diff_rlgr3(sint16 *coef, uint8 *cdata, int cdata_size)
             numZeros = 0;
 
             GetNextInput;
-            while (input == 0 && coef_size > 0)
+            while (input == 0)
             {
                 numZeros++;
+                if (coef_size < 1)
+                {
+                    break;
+                }
                 GetNextInput;
             }
 
@@ -186,6 +196,11 @@ rfx_encode_diff_rlgr3(sint16 *coef, uint8 *cdata, int cdata_size)
 
             CheckWrite;
 
+            if (input == 0)
+            {
+                continue;
+            }
+
             /* encode the nonzero value using GR coding */
             if (input < 0)
             {
@@ -202,7 +217,7 @@ rfx_encode_diff_rlgr3(sint16 *coef, uint8 *cdata, int cdata_size)
             bits |= sign;
             bit_count++;
 
-            lmag = mag ? mag - 1 : 0;
+            lmag = mag - 1;
 
             CodeGR(krp, lmag); /* output GR code for (mag - 1) */
             CheckWrite;
@@ -276,6 +291,10 @@ rfx_encode_diff_rlgr3(sint16 *coef, uint8 *cdata, int cdata_size)
 
     if (bit_count > 0)
     {
+        if (cdata_size < 1)
+        {
+            return -1;
+        }
         bits <<= 8 - bit_count;
         *cdata = bits;
         cdata++;
diff --git a/src/rfxencode_diff_rlgr3.h b/src/rfxencode_diff_rlgr3.h
index dd61309..41b8f6a 100644
--- a/src/rfxencode_diff_rlgr3.h
+++ b/src/rfxencode_diff_rlgr3.h
@@ -23,7 +23,8 @@
 #include "rfxcommon.h"
 
 int
-rfx_encode_diff_rlgr3(sint16 *coef, uint8 *cdata, int cdata_size);
+rfx_encode_diff_rlgr3(sint16 *coef, uint8 *cdata, int cdata_size,
+                      int diff_bytes);
 
 #endif /* __RFX_DIFF_RLGR3_H */
 
diff --git a/src/rfxencode_dwt.c b/src/rfxencode_dwt.c
index 5d82044..4984858 100644
--- a/src/rfxencode_dwt.c
+++ b/src/rfxencode_dwt.c
@@ -27,12 +27,14 @@
 #include <string.h>
 
 #include "rfxcommon.h"
+#include "rfxencode_dwt.h"
 
 /******************************************************************************/
 static int
-rfx_dwt_2d_encode_horz(sint16 *buffer, sint16 *dwt, int subband_width)
+rfx_dwt_2d_encode_horz(const sint16 *in_buffer, sint16 *out_buffer,
+                       int subband_width)
 {
-    sint16 *l_src, *h_src;
+    const sint16 *l_src, *h_src;
     sint16 *hl, *lh, *hh, *ll;
     int x, y;
     int n;
@@ -42,13 +44,13 @@ rfx_dwt_2d_encode_horz(sint16 *buffer, sint16 *dwt, int subband_width)
     /* The lower part L generates LL(3) and HL(0). */
     /* The higher part H generates LH(1) and HH(2). */
 
-    ll = buffer + subband_width * subband_width * 3;
-    hl = buffer;
-    l_src = dwt;
+    ll = out_buffer + subband_width * subband_width * 3;
+    hl = out_buffer;
+    l_src = in_buffer;
 
-    lh = buffer + subband_width * subband_width;
-    hh = buffer + subband_width * subband_width * 2;
-    h_src = dwt + subband_width * subband_width * 2;
+    lh = out_buffer + subband_width * subband_width;
+    hh = out_buffer + subband_width * subband_width * 2;
+    h_src = in_buffer + subband_width * subband_width * 2;
 
     for (y = 0; y < subband_width; y++)
     {
@@ -102,7 +104,8 @@ rfx_dwt_2d_encode_horz(sint16 *buffer, sint16 *dwt, int subband_width)
 
 /******************************************************************************/
 static int
-rfx_dwt_2d_encode_block(sint16 *buffer, sint16 *dwt, int subband_width)
+rfx_dwt_2d_encode_block(sint16 *in_out_buffer, sint16 *tmp_buffer,
+                        int subband_width)
 {
     sint16 *src, *l, *h;
     int total_width;
@@ -112,14 +115,14 @@ rfx_dwt_2d_encode_block(sint16 *buffer, sint16 *dwt, int subband_width)
     total_width = subband_width << 1;
 
     /* DWT in vertical direction, results in 2 sub-bands in L, H order in
-     * tmp buffer dwt. */
+     * tmp buffer. */
     for (x = 0; x < total_width; x++)
     {
 
         /* pre */
-        l = dwt + x;
+        l = tmp_buffer + x;
         h = l + subband_width * total_width;
-        src = buffer + x;
+        src = in_out_buffer + x;
         *h = (src[total_width] - ((src[0] + src[2 * total_width]) >> 1)) >> 1;
         *l = src[0] + (*h);
 
@@ -127,9 +130,9 @@ rfx_dwt_2d_encode_block(sint16 *buffer, sint16 *dwt, int subband_width)
         for (n = 1; n < subband_width - 1; n++)
         {
             y = n << 1;
-            l = dwt + n * total_width + x;
+            l = tmp_buffer + n * total_width + x;
             h = l + subband_width * total_width;
-            src = buffer + y * total_width + x;
+            src = in_out_buffer + y * total_width + x;
             *h = (src[total_width] - ((src[0] + src[2 * total_width]) >> 1)) >> 1;
             *l = src[0] + ((*(h - total_width) + *h) >> 1);
         }
@@ -137,21 +140,22 @@ rfx_dwt_2d_encode_block(sint16 *buffer, sint16 *dwt, int subband_width)
         /* post */
         n = subband_width - 1;
         y = n << 1;
-        l = dwt + n * total_width + x;
+        l = tmp_buffer + n * total_width + x;
         h = l + subband_width * total_width;
-        src = buffer + y * total_width + x;
+        src = in_out_buffer + y * total_width + x;
         *h = (src[total_width] - ((src[0] + src[0]) >> 1)) >> 1;
         *l = src[0] + ((*(h - total_width) + *h) >> 1);
 
     }
 
-    return rfx_dwt_2d_encode_horz(buffer, dwt, subband_width);
+    return rfx_dwt_2d_encode_horz(tmp_buffer, in_out_buffer, subband_width);
 }
 
 /******************************************************************************/
 static int
 rfx_dwt_2d_encode_block8(const uint8 *in_buffer,
-                         sint16 *buffer, sint16 *dwt, int subband_width)
+                         sint16 *out_buffer, sint16 *tmp_buffer,
+                         int subband_width)
 {
     const uint8 *src;
     sint16 *l, *h;
@@ -163,12 +167,12 @@ rfx_dwt_2d_encode_block8(const uint8 *in_buffer,
     total_width = subband_width << 1;
 
     /* DWT in vertical direction, results in 2 sub-bands in L, H order in
-     * tmp buffer dwt. */
+     * tmp buffer. */
     for (x = 0; x < total_width; x++)
     {
 
         /* pre */
-        l = dwt + x;
+        l = tmp_buffer + x;
         h = l + subband_width * total_width;
         src = in_buffer + x;
         s1 = (src[total_width] - 128) << DWT_FACTOR;
@@ -182,7 +186,7 @@ rfx_dwt_2d_encode_block8(const uint8 *in_buffer,
         for (n = 1; n < subband_width - 1; n++)
         {
             y = n << 1;
-            l = dwt + n * total_width + x;
+            l = tmp_buffer + n * total_width + x;
             h = l + subband_width * total_width;
             src = in_buffer + y * total_width + x;
             s1 = (src[total_width] - 128) << DWT_FACTOR;
@@ -196,7 +200,7 @@ rfx_dwt_2d_encode_block8(const uint8 *in_buffer,
         /* post */
         n = subband_width - 1;
         y = n << 1;
-        l = dwt + n * total_width + x;
+        l = tmp_buffer + n * total_width + x;
         h = l + subband_width * total_width;
         src = in_buffer + y * total_width + x;
         s1 = (src[total_width] - 128) << DWT_FACTOR;
@@ -208,15 +212,16 @@ rfx_dwt_2d_encode_block8(const uint8 *in_buffer,
 
     }
 
-    return rfx_dwt_2d_encode_horz(buffer, dwt, subband_width);
+    return rfx_dwt_2d_encode_horz(tmp_buffer, out_buffer, subband_width);
 }
 
 /******************************************************************************/
 int
-rfx_dwt_2d_encode(const uint8 *in_buffer, sint16 *buffer, sint16 *dwt_buffer)
+rfx_dwt_2d_encode(const uint8 *in_buffer, sint16 *out_buffer,
+                  sint16 *tmp_buffer)
 {
-    rfx_dwt_2d_encode_block8(in_buffer, buffer, dwt_buffer, 32);
-    rfx_dwt_2d_encode_block(buffer + 3072, dwt_buffer, 16);
-    rfx_dwt_2d_encode_block(buffer + 3840, dwt_buffer, 8);
+    rfx_dwt_2d_encode_block8(in_buffer, out_buffer, tmp_buffer, 32);
+    rfx_dwt_2d_encode_block(out_buffer + 3072, tmp_buffer, 16);
+    rfx_dwt_2d_encode_block(out_buffer + 3840, tmp_buffer, 8);
     return 0;
 }
diff --git a/src/rfxencode_dwt.h b/src/rfxencode_dwt.h
index 2d91176..0a82650 100644
--- a/src/rfxencode_dwt.h
+++ b/src/rfxencode_dwt.h
@@ -16,10 +16,11 @@
  * limitations under the License.
  */
 
-#ifndef __RFXENCODE_RFX_H
-#define __RFXENCODE_RFX_H
+#ifndef __RFXENCODE_RFX_DWT_H
+#define __RFXENCODE_RFX_DWT_H
 
 int
-rfx_dwt_2d_encode(const uint8 *in_buffer, sint16 *buffer, sint16 *dwt_buffer);
+rfx_dwt_2d_encode(const uint8 *in_buffer, sint16 *out_buffer,
+                  sint16 *tmp_buffer);
 
 #endif
diff --git a/src/rfxencode_dwt_rem.c b/src/rfxencode_dwt_rem.c
new file mode 100644
index 0000000..7510512
--- /dev/null
+++ b/src/rfxencode_dwt_rem.c
@@ -0,0 +1,542 @@
+/**
+ * RemoteFX Codec Library
+ *
+ * Copyright 2020 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * DWT Reduce-Extrapolate Method MS-RDPEGFX 3.2.8.1.2.2
+ * also does Linearization 3.2.8.1.3 (no quantization is applied in this file)
+ */
+
+#if defined(HAVE_CONFIG_H)
+#include <config_ac.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "rfxcommon.h"
+#include "rfxencode_dwt_rem.h"
+
+#define ICL1(_offset) ((ic[(_offset) * 64] - 128) << DWT_FACTOR) /* level 1 input: centre on 0, scale */
+#define ICL2(_offset) ic[(_offset) * 33] /* level 2 input, row stride 33 */
+#define ICL3(_offset) ic[(_offset) * 17] /* level 3 input, row stride 17 */
+/* LOLn / HILn: low / high output element _offset of a column, level n row stride */
+#define LOL1(_offset) lo[(_offset) * 64]
+#define HIL1(_offset) hi[(_offset) * 64]
+#define LOL2(_offset) lo[(_offset) * 33]
+#define HIL2(_offset) hi[(_offset) * 33]
+#define LOL3(_offset) lo[(_offset) * 17]
+#define HIL3(_offset) hi[(_offset) * 17]
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_encode_vert_lv1(const uint8 *in_buffer, sint16 *out_buffer)
+{
+    const uint8 *ic; /* input coefficients */
+    sint16 *lo;     /* low-pass output of the current column */
+    sint16 *hi;     /* high-pass output of the current column */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic62;    /* scaled sample 62, reused by the mirror step */
+    int n;          /* output index along the column */
+    int y;          /* column index, samples of one column are 64 apart */
+    /* level 1 vertical pass: every 64 sample column -> 33 low + 31 high */
+    for (y = 0; y < 64; y++)
+    {
+
+        /* setup: high-pass rows are stored after the 33 low-pass rows */
+        ic = in_buffer + y;
+        lo = out_buffer + y;
+        hi = lo + 64 * 33;
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ICL1(0);
+        x2n1 = ICL1(1);
+        x2n2 = ICL1(2);
+        HIL1(0) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL1(0) = x2n + hn; /* mirror */
+
+        /* loop: n = 1 .. 30, x[2n] is carried over from the previous step */
+        for (n = 1; n < 31; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ICL1(2 * n + 1);
+            x2n2 = ICL1(2 * n + 2);
+            HIL1(n) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            LOL1(n) = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: sample 64 does not exist and is extrapolated; no H[31] is stored */
+        hn1 = hn;
+        ic62 = x2n = x2n2;
+        x2n1 = ICL1(63);
+        x2n2 = 2 * x2n1 - x2n; /* ic[64] = 2 * ic[63] - ic[62] */
+        LOL1(31) = x2n + (hn1 >> 1);
+
+        x2n = x2n2;
+        /* x2n1 already set, mirror 65 -> 63 */
+        x2n2 = ic62;      /* mirror 66 -> 62 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL1(32) = x2n + (hn >> 1);
+
+    }
+}
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_encode_horz_lv1(const sint16 *in_buffer, sint16 *out_buffer)
+{
+    const sint16 *ic; /* input coefficients */
+    sint16 *lo;     /* low-pass output of the current row */
+    sint16 *hi;     /* high-pass output of the current row */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic62;    /* sample 62, reused by the mirror step */
+    int n;          /* output index along the row */
+    int y;          /* row index, input rows are 64 samples apart */
+    /* level 1 horizontal pass: every 64 sample row -> 33 low + 31 high */
+    for (y = 0; y < 33; y++) /* lo */
+    {
+
+        /* setup: input rows 0 .. 32 feed LL1 (33 per row) and HL1 (31 per row) */
+        ic = in_buffer + 64 * y;
+        lo = out_buffer + 31 * 33 + 33 * 31 + 31 * 31 + 33 * y; /* LL1 */
+        hi = out_buffer + 31 * y; /* HL1 */
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[0] = x2n + hn; /* mirror */
+
+        /* loop: n = 1 .. 30, x[2n] is carried over from the previous step */
+        for (n = 1; n < 31; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            lo[n] = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: sample 64 does not exist and is extrapolated; no hi[31] is stored */
+        hn1 = hn;
+        ic62 = x2n = x2n2;
+        x2n1 = ic[63];
+        x2n2 = 2 * x2n1 - x2n; /* ic[64] = 2 * ic[63] - ic[62] */
+        lo[31] = x2n + (hn1 >> 1);
+
+        x2n = x2n2;
+        /* x2n1 already set, mirror 65 -> 63 */
+        x2n2 = ic62;      /* mirror 66 -> 62 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[32] = x2n + (hn >> 1);
+
+    }
+
+    for (y = 0; y < 31; y++) /* hi */
+    {
+
+        /* setup: input rows 33 .. 63 feed LH1 (33 per row) and HH1 (31 per row) */
+        ic = in_buffer + 64 * (33 + y);
+        lo = out_buffer + 31 * 33 + 33 * y; /* LH1 */
+        hi = out_buffer + 31 * 33 + 33 * 31 + 31 * y; /* HH1 */
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[0] = x2n + hn;
+
+        /* loop: n = 1 .. 30 */
+        for (n = 1; n < 31; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            lo[n] = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: same extrapolate / mirror handling as the first loop */
+        hn1 = hn;
+        ic62 = x2n = x2n2;
+        x2n1 = ic[63];
+        x2n2 = 2 * x2n1 - x2n; /* ic[64] = 2 * ic[63] - ic[62] */
+        lo[31] = x2n + (hn1 >> 1);
+
+        x2n = x2n2;
+        /* x2n1 already set, mirror 65 -> 63 */
+        x2n2 = ic62;      /* mirror 66 -> 62 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[32] = x2n + (hn >> 1);
+
+    }
+
+}
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_encode_vert_lv2(const sint16 *in_buffer, sint16 *out_buffer)
+{
+    const sint16 *ic; /* input coefficients */
+    sint16 *lo;     /* low-pass output of the current column */
+    sint16 *hi;     /* high-pass output of the current column */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic30;    /* sample 30, reused by the mirror step */
+    int n;          /* output index along the column */
+    int y;          /* column index, samples of one column are 33 apart */
+    /* level 2 vertical pass: every 33 sample column -> 17 low + 16 high */
+    for (y = 0; y < 33; y++)
+    {
+
+        /* setup: high-pass rows are stored after the 17 low-pass rows */
+        ic = in_buffer + y;
+        lo = out_buffer + y;
+        hi = lo + 33 * 17;
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ICL2(0);
+        x2n1 = ICL2(1);
+        x2n2 = ICL2(2);
+        HIL2(0) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL2(0) = x2n + hn; /* mirror */
+
+        /* loop: n = 1 .. 14, x[2n] is carried over from the previous step */
+        for (n = 1; n < 15; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ICL2(2 * n + 1);
+            x2n2 = ICL2(2 * n + 2);
+            HIL2(n) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            LOL2(n) = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: n = 15 uses the last real sample 32, n = 16 uses mirrored samples */
+        hn1 = hn;
+        ic30 = x2n = x2n2;
+        x2n1 = ICL2(31);
+        x2n2 = ICL2(32);
+        HIL2(15) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL2(15) = x2n + ((hn1 + hn) >> 1);
+
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 33 -> 31 */
+        x2n2 = ic30;      /* mirror 34 -> 30 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL2(16) = x2n + ((hn1 + hn) >> 1);
+
+    }
+}
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_encode_horz_lv2(const sint16 *in_buffer, sint16 *out_buffer)
+{
+    const sint16 *ic; /* input coefficients */
+    sint16 *lo;     /* low-pass output of the current row */
+    sint16 *hi;     /* high-pass output of the current row */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic30;    /* sample 30, reused by the mirror step */
+    int n;          /* output index along the row */
+    int y;          /* row index, input rows are 33 samples apart */
+    /* level 2 horizontal pass: every 33 sample row -> 17 low + 16 high */
+    for (y = 0; y < 17; y++) /* lo */
+    {
+
+        /* setup: input rows 0 .. 16 feed LL2 (17 per row) and HL2 (16 per row) */
+        ic = in_buffer + 33 * y;
+        lo = out_buffer + 16 * 17 + 17 * 16 + 16 * 16 + 17 * y; /* LL2 */
+        hi = out_buffer + 16 * y; /* HL2 */
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[0] = x2n + hn;
+
+        /* loop: n = 1 .. 14, x[2n] is carried over from the previous step */
+        for (n = 1; n < 15; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            lo[n] = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: n = 15 uses the last real sample 32, n = 16 uses mirrored samples */
+        hn1 = hn;
+        ic30 = x2n = x2n2;
+        x2n1 = ic[31];
+        x2n2 = ic[32];
+        hi[15] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[15] = x2n + ((hn1 + hn) >> 1);
+
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 33 -> 31 */
+        x2n2 = ic30;      /* mirror 34 -> 30 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[16] = x2n + ((hn1 + hn) >> 1);
+
+    }
+
+    for (y = 0; y < 16; y++) /* hi */
+    {
+
+        /* setup: input rows 17 .. 32 feed LH2 (17 per row) and HH2 (16 per row) */
+        ic = in_buffer + 33 * (17 + y);
+        lo = out_buffer + 16 * 17 + 17 * y; /* LH2 */
+        hi = out_buffer + 16 * 17 + 17 * 16 + 16 * y; /* HH2 */
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[0] = x2n + hn;
+
+        /* loop: n = 1 .. 14 */
+        for (n = 1; n < 15; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            lo[n] = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: same last-sample / mirror handling as the first loop */
+        hn1 = hn;
+        ic30 = x2n = x2n2;
+        x2n1 = ic[31];
+        x2n2 = ic[32];
+        hi[15] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[15] = x2n + ((hn1 + hn) >> 1);
+
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 33 -> 31 */
+        x2n2 = ic30;      /* mirror 34 -> 30 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[16] = x2n + ((hn1 + hn) >> 1);
+
+    }
+
+}
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_encode_vert_lv3(const sint16 *in_buffer, sint16 *out_buffer)
+{
+    const sint16 *ic; /* input coefficients */
+    sint16 *lo;     /* low-pass output of the current column */
+    sint16 *hi;     /* high-pass output of the current column */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic14;    /* sample 14, reused by the mirror step */
+    int n;          /* output index along the column */
+    int y;          /* column index, samples of one column are 17 apart */
+    /* level 3 vertical pass: every 17 sample column -> 9 low + 8 high */
+    for (y = 0; y < 17; y++)
+    {
+
+        /* setup: high-pass rows are stored after the 9 low-pass rows */
+        ic = in_buffer + y;
+        lo = out_buffer + y;
+        hi = lo + 17 * 9;
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ICL3(0);
+        x2n1 = ICL3(1);
+        x2n2 = ICL3(2);
+        HIL3(0) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL3(0) = x2n + hn; /* mirror */
+
+        /* loop: n = 1 .. 6, x[2n] is carried over from the previous step */
+        for (n = 1; n < 7; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ICL3(2 * n + 1);
+            x2n2 = ICL3(2 * n + 2);
+            HIL3(n) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            LOL3(n) = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: n = 7 uses the last real sample 16, n = 8 uses mirrored samples */
+        hn1 = hn;
+        ic14 = x2n = x2n2;
+        x2n1 = ICL3(15);
+        x2n2 = ICL3(16);
+        HIL3(7) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL3(7) = x2n + ((hn1 + hn) >> 1);
+
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 17 -> 15 */
+        x2n2 = ic14;      /* mirror 18 -> 14 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL3(8) = x2n + ((hn1 + hn) >> 1);
+
+    }
+}
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_encode_horz_lv3(const sint16 *in_buffer, sint16 *out_buffer)
+{
+    const sint16 *ic; /* input coefficients */
+    sint16 *lo;     /* low-pass output of the current row */
+    sint16 *hi;     /* high-pass output of the current row */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic14;    /* sample 14, reused by the mirror step */
+    int n;          /* output index along the row */
+    int y;          /* row index, input rows are 17 samples apart */
+    /* level 3 horizontal pass: every 17 sample row -> 9 low + 8 high */
+    for (y = 0; y < 9; y++) /* lo */
+    {
+
+        /* setup: input rows 0 .. 8 feed LL3 (9 per row) and HL3 (8 per row) */
+        ic = in_buffer + 17 * y;
+        lo = out_buffer + 8 * 9 + 9 * 8 + 8 * 8 + 9 * y; /* LL3 */
+        hi = out_buffer + 8 * y; /* HL3 */
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[0] = x2n + hn; /* mirror */
+
+        /* loop: n = 1 .. 6, x[2n] is carried over from the previous step */
+        for (n = 1; n < 7; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            lo[n] = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: n = 7 uses the last real sample 16, n = 8 uses mirrored samples */
+        hn1 = hn;
+        ic14 = x2n = x2n2;
+        x2n1 = ic[15];
+        x2n2 = ic[16];
+        hi[7] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[7] = x2n + ((hn1 + hn) >> 1);
+
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 17 -> 15 */
+        x2n2 = ic14;      /* mirror 18 -> 14 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[8] = x2n + ((hn1 + hn) >> 1);
+
+    }
+
+    for (y = 0; y < 8; y++) /* hi */
+    {
+
+        /* setup: input rows 9 .. 16 feed LH3 (9 per row) and HH3 (8 per row) */
+        ic = in_buffer + 17 * (9 + y);
+        lo = out_buffer + 8 * 9 + 9 * y; /* LH3 */
+        hi = out_buffer + 8 * 9 + 9 * 8 + 8 * y; /* HH3 */
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[0] = x2n + hn; /* mirror */
+
+        /* loop: n = 1 .. 6 */
+        for (n = 1; n < 7; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            lo[n] = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: same last-sample / mirror handling as the first loop */
+        hn1 = hn;
+        ic14 = x2n = x2n2;
+        x2n1 = ic[15];
+        x2n2 = ic[16];
+        hi[7] = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[7] = x2n + ((hn1 + hn) >> 1);
+
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 17 -> 15 */
+        x2n2 = ic14;      /* mirror 18 -> 14 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[8] = x2n + ((hn1 + hn) >> 1);
+
+    }
+
+}
+
+/******************************************************************************/
+int
+rfx_rem_dwt_encode(const uint8 *in_buffer, sint16 *out_buffer,
+                   sint16 *tmp_buffer)
+{   /* three level DWT: each level runs a vertical pass into tmp_buffer, then a horizontal pass */
+    rfx_rem_dwt_encode_vert_lv1(in_buffer, tmp_buffer);
+    rfx_rem_dwt_encode_horz_lv1(tmp_buffer, out_buffer);
+    rfx_rem_dwt_encode_vert_lv2(out_buffer + 3007, tmp_buffer); /* LL1 at 31*33 + 33*31 + 31*31 */
+    rfx_rem_dwt_encode_horz_lv2(tmp_buffer, out_buffer + 3007); /* level 2 bands overwrite LL1 */
+    rfx_rem_dwt_encode_vert_lv3(out_buffer + 3807, tmp_buffer); /* LL2 at 3007 + 16*17 + 17*16 + 16*16 */
+    rfx_rem_dwt_encode_horz_lv3(tmp_buffer, out_buffer + 3807); /* level 3 bands overwrite LL2 */
+    return 0; /* always 0 */
+}
diff --git a/src/rfxencode_dwt_rem.h b/src/rfxencode_dwt_rem.h
new file mode 100644
index 0000000..7171f86
--- /dev/null
+++ b/src/rfxencode_dwt_rem.h
@@ -0,0 +1,26 @@
+/**
+ * RFX codec encoder
+ *
+ * Copyright 2020 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __RFXENCODE_RFX_REM_DWT_H
+#define __RFXENCODE_RFX_REM_DWT_H
+
+int
+rfx_rem_dwt_encode(const uint8 *in_buffer, sint16 *out_buffer,
+                   sint16 *tmp_buffer);
+
+#endif
diff --git a/src/rfxencode_dwt_shift_rem.c b/src/rfxencode_dwt_shift_rem.c
new file mode 100644
index 0000000..fa3ef2a
--- /dev/null
+++ b/src/rfxencode_dwt_shift_rem.c
@@ -0,0 +1,568 @@
+/**
+ * RemoteFX Codec Library
+ *
+ * Copyright 2020 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * DWT Reduce-Extrapolate Method MS-RDPEGFX 3.2.8.1.2.2
+ * also does Quantization and Linearization 3.2.8.1.3
+ */
+
+#if defined(HAVE_CONFIG_H)
+#include <config_ac.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "rfxcommon.h"
+#include "rfxencode_dwt_rem.h"
+
+#define ICL1(_offset) ((ic[(_offset) * 64] - 128) << DWT_FACTOR) /* level 1 input: centre on 0, scale */
+#define ICL2(_offset) ic[(_offset) * 33] /* level 2 input, row stride 33 */
+#define ICL3(_offset) ic[(_offset) * 17] /* level 3 input, row stride 17 */
+/* LOLn / HILn: low / high output element _offset of a column, level n row stride */
+#define LOL1(_offset) lo[(_offset) * 64]
+#define HIL1(_offset) hi[(_offset) * 64]
+#define LOL2(_offset) lo[(_offset) * 33]
+#define HIL2(_offset) hi[(_offset) * 33]
+#define LOL3(_offset) lo[(_offset) * 17]
+#define HIL3(_offset) hi[(_offset) * 17]
+
+#define SETUPLOQ(_index, _shift) do { /* low band: shift from a quants nibble, bias = half a step */ \
+    lo_fact = (((quants[_index] >> (_shift)) & 0xf) - 6) + DWT_FACTOR; \
+    lo_half = 1 << (lo_fact - 1); } while (0)
+#define SETUPHIQ(_index, _shift) do { /* high band: shift from a quants nibble, bias = half a step */ \
+    hi_fact = (((quants[_index] >> (_shift)) & 0xf) - 6) + DWT_FACTOR; \
+    hi_half = 1 << (hi_fact - 1); } while (0)
+#define LOQ(_val) (((_val) + lo_half) >> lo_fact) /* add bias, then shift: rounded quantization */
+#define HIQ(_val) (((_val) + hi_half) >> hi_fact) /* add bias, then shift: rounded quantization */
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_shift_encode_vert_lv1(const uint8 *in_buffer, sint16 *out_buffer)
+{
+    const uint8 *ic; /* input coefficients */
+    sint16 *lo;     /* low-pass output of the current column */
+    sint16 *hi;     /* high-pass output of the current column */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic62;    /* scaled sample 62, reused by the mirror step */
+    int n;          /* output index along the column */
+    int y;          /* column index, samples of one column are 64 apart */
+    /* level 1 vertical pass, no quantization: 64 sample column -> 33 low + 31 high */
+    for (y = 0; y < 64; y++)
+    {
+
+        /* setup: high-pass rows are stored after the 33 low-pass rows */
+        ic = in_buffer + y;
+        lo = out_buffer + y;
+        hi = lo + 64 * 33;
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ICL1(0);
+        x2n1 = ICL1(1);
+        x2n2 = ICL1(2);
+        HIL1(0) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL1(0) = x2n + hn; /* mirror */
+
+        /* loop: n = 1 .. 30, x[2n] is carried over from the previous step */
+        for (n = 1; n < 31; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ICL1(2 * n + 1);
+            x2n2 = ICL1(2 * n + 2);
+            HIL1(n) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            LOL1(n) = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: sample 64 does not exist and is extrapolated; no H[31] is stored */
+        hn1 = hn;
+        ic62 = x2n = x2n2;
+        x2n1 = ICL1(63);
+        x2n2 = 2 * x2n1 - x2n; /* ic[64] = 2 * ic[63] - ic[62] */
+        LOL1(31) = x2n + (hn1 >> 1);
+
+        x2n = x2n2;
+        /* x2n1 already set, mirror 65 -> 63 */
+        x2n2 = ic62;      /* mirror 66 -> 62 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL1(32) = x2n + (hn >> 1);
+    }
+}
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_shift_encode_horz_lv1(const sint16 *in_buffer, sint16 *out_buffer,
+                                  const char *quants)
+{
+    const sint16 *ic; /* input coefficients */
+    sint16 *lo;     /* low-pass output of the current row */
+    sint16 *hi;     /* high-pass output of the current row */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic62;    /* sample 62, reused by the mirror step */
+    int n;          /* output index along the row */
+    int y;          /* row index, input rows are 64 samples apart */
+    int lo_fact;    /* right shift applied by LOQ */
+    int hi_fact;    /* right shift applied by HIQ */
+    int lo_half;    /* bias added by LOQ before shifting */
+    int hi_half;    /* bias added by HIQ before shifting */
+    /* level 1 horizontal pass; quants holds two 4-bit values per byte */
+    SETUPHIQ(4, 0); /* HL1 */
+    for (y = 0; y < 33; y++) /* lo */
+    {
+        /* setup: input rows 0 .. 32 feed LL1 (stored unquantized) and HL1 */
+        ic = in_buffer + 64 * y;
+        lo = out_buffer + 31 * 33 + 33 * 31 + 31 * 31 + 33 * y; /* LL1 */
+        hi = out_buffer + 31 * y; /* HL1 */
+
+        /* pre: n = 0; hn keeps the unquantized H[0] for the low-pass update */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+        lo[0] = x2n + hn; /* mirror */
+
+        /* loop: n = 1 .. 30, x[2n] is carried over from the previous step */
+        for (n = 1; n < 31; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+            lo[n] = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: sample 64 does not exist and is extrapolated; no hi[31] is stored */
+        hn1 = hn;
+        ic62 = x2n = x2n2;
+        x2n1 = ic[63];
+        x2n2 = 2 * x2n1 - x2n; /* ic[64] = 2 * ic[63] - ic[62] */
+        lo[31] = x2n + (hn1 >> 1);
+
+        x2n = x2n2;
+        /* x2n1 already set, mirror 65 -> 63 */
+        x2n2 = ic62;      /* mirror 66 -> 62 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[32] = x2n + (hn >> 1);
+
+    }
+
+    SETUPLOQ(3, 4); /* LH1 */
+    SETUPHIQ(4, 4); /* HH1 */
+    for (y = 0; y < 31; y++) /* hi */
+    {
+
+        /* setup: input rows 33 .. 63 feed LH1 and HH1, both stored through LOQ / HIQ */
+        ic = in_buffer + 64 * (33 + y);
+        lo = out_buffer + 31 * 33 + 33 * y; /* LH1 */
+        hi = out_buffer + 31 * 33 + 33 * 31 + 31 * y; /* HH1 */
+
+        /* pre: n = 0; hn keeps the unquantized H[0] for the low-pass update */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+        lo[0] = LOQ(x2n + hn);
+
+        /* loop: n = 1 .. 30 */
+        for (n = 1; n < 31; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+            lo[n] = LOQ(x2n + ((hn1 + hn) >> 1));
+        }
+
+        /* post: same extrapolate / mirror handling as the first loop */
+        hn1 = hn;
+        ic62 = x2n = x2n2;
+        x2n1 = ic[63];
+        x2n2 = 2 * x2n1 - x2n; /* ic[64] = 2 * ic[63] - ic[62] */
+        lo[31] = LOQ(x2n + (hn1 >> 1));
+
+        x2n = x2n2;
+        /* x2n1 already set, mirror 65 -> 63 */
+        x2n2 = ic62;      /* mirror 66 -> 62 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[32] = LOQ(x2n + (hn >> 1));
+    }
+}
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_shift_encode_vert_lv2(const sint16 *in_buffer, sint16 *out_buffer)
+{
+    const sint16 *ic; /* input coefficients */
+    sint16 *lo;     /* low-pass output of the current column */
+    sint16 *hi;     /* high-pass output of the current column */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic30;    /* sample 30, reused by the mirror step */
+    int n;          /* output index along the column */
+    int y;          /* column index, samples of one column are 33 apart */
+    /* level 2 vertical pass, no quantization: 33 sample column -> 17 low + 16 high */
+    for (y = 0; y < 33; y++)
+    {
+
+        /* setup: high-pass rows are stored after the 17 low-pass rows */
+        ic = in_buffer + y;
+        lo = out_buffer + y;
+        hi = lo + 33 * 17;
+
+        /* pre: n = 0, H[-1] is taken as H[0] */
+        x2n = ICL2(0);
+        x2n1 = ICL2(1);
+        x2n2 = ICL2(2);
+        HIL2(0) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL2(0) = x2n + hn; /* mirror */
+
+        /* loop: n = 1 .. 14, x[2n] is carried over from the previous step */
+        for (n = 1; n < 15; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ICL2(2 * n + 1);
+            x2n2 = ICL2(2 * n + 2);
+            HIL2(n) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            LOL2(n) = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: n = 15 uses the last real sample 32, n = 16 uses mirrored samples */
+        hn1 = hn;
+        ic30 = x2n = x2n2;
+        x2n1 = ICL2(31);
+        x2n2 = ICL2(32);
+        HIL2(15) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL2(15) = x2n + ((hn1 + hn) >> 1);
+
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 33 -> 31 */
+        x2n2 = ic30;      /* mirror 34 -> 30 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL2(16) = x2n + ((hn1 + hn) >> 1);
+
+    }
+}
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_shift_encode_horz_lv2(const sint16 *in_buffer, sint16 *out_buffer,
+                                  const char *quants)
+{
+    const sint16 *ic; /* input coefficients */
+    sint16 *lo;     /* low-pass output of the current row */
+    sint16 *hi;     /* high-pass output of the current row */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic30;    /* sample 30, reused by the mirror step */
+    int n;          /* output index along the row */
+    int y;          /* row index, input rows are 33 samples apart */
+    int lo_fact;    /* right shift applied by LOQ */
+    int hi_fact;    /* right shift applied by HIQ */
+    int lo_half;    /* bias added by LOQ before shifting */
+    int hi_half;    /* bias added by HIQ before shifting */
+    /* level 2 horizontal pass; quants holds two 4-bit values per byte */
+    SETUPHIQ(2, 4); /* HL2 */
+    for (y = 0; y < 17; y++) /* lo */
+    {
+
+        /* setup: input rows 0 .. 16 feed LL2 (stored unquantized) and HL2 */
+        ic = in_buffer + 33 * y;
+        lo = out_buffer + 16 * 17 + 17 * 16 + 16 * 16 + 17 * y; /* LL2 */
+        hi = out_buffer + 16 * y; /* HL2 */
+
+        /* pre: n = 0; hn keeps the unquantized H[0] for the low-pass update */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+        lo[0] = x2n + hn;
+
+        /* loop: n = 1 .. 14, x[2n] is carried over from the previous step */
+        for (n = 1; n < 15; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+            lo[n] = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: n = 15 uses the last real sample 32, n = 16 uses mirrored samples */
+        hn1 = hn;
+        ic30 = x2n = x2n2;
+        x2n1 = ic[31];
+        x2n2 = ic[32];
+        hi[15] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+        lo[15] = x2n + ((hn1 + hn) >> 1);
+
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 33 -> 31 */
+        x2n2 = ic30;      /* mirror 34 -> 30 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[16] = x2n + ((hn1 + hn) >> 1);
+
+    }
+
+    SETUPHIQ(3, 0); /* HH2 */
+    SETUPLOQ(2, 0); /* LH2 */
+    for (y = 0; y < 16; y++) /* hi */
+    {
+
+        /* setup: input rows 17 .. 32 feed LH2 and HH2, both stored through LOQ / HIQ */
+        ic = in_buffer + 33 * (17 + y);
+        lo = out_buffer + 16 * 17 + 17 * y; /* LH2 */
+        hi = out_buffer + 16 * 17 + 17 * 16 + 16 * y; /* HH2 */
+
+        /* pre: n = 0; hn keeps the unquantized H[0] for the low-pass update */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+        lo[0] = LOQ(x2n + hn);
+
+        /* loop: n = 1 .. 14 */
+        for (n = 1; n < 15; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+            lo[n] = LOQ(x2n + ((hn1 + hn) >> 1));
+        }
+
+        /* post: same last-sample / mirror handling as the first loop */
+        hn1 = hn;
+        ic30 = x2n = x2n2;
+        x2n1 = ic[31];
+        x2n2 = ic[32];
+        hi[15] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+        lo[15] = LOQ(x2n + ((hn1 + hn) >> 1));
+
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 33 -> 31 */
+        x2n2 = ic30;      /* mirror 34 -> 30 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[16] = LOQ(x2n + ((hn1 + hn) >> 1));
+
+    }
+
+}
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_shift_encode_vert_lv3(const sint16 *in_buffer, sint16 *out_buffer)
+{
+    const sint16 *ic; /* input coefficients, in_buffer offset by the column index */
+    sint16 *lo;     /* low-pass output, out_buffer offset by the column index */
+    sint16 *hi;     /* high-pass output, a fixed distance past lo */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic14;    /* copy of sample 14, reused as the mirrored sample 18 */
+    int n;          /* output index within the current column */
+    int y;          /* column index, 0..16 */
+    /* Level-3 vertical pass: each of 17 columns gives 9 low (LOL3 0..8) and 8 high (HIL3 0..7) values. */
+    for (y = 0; y < 17; y++)
+    {
+        /* setup; ICL3/LOL3/HIL3 are defined outside this view, presumably stepping ic/lo/hi by rows of 17 -- confirm */
+        ic = in_buffer + y;
+        lo = out_buffer + y;
+        hi = lo + 17 * 9; /* high part starts 17 * 9 elements past the low part */
+
+        /* pre: first output pair, from samples 0..2 */
+        x2n = ICL3(0);
+        x2n1 = ICL3(1);
+        x2n2 = ICL3(2);
+        HIL3(0) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL3(0) = x2n + hn; /* mirror: there is no H[-1], so H[0] is added on its own */
+
+        /* loop: outputs 1..6; x2n2 of one step becomes x2n of the next */
+        for (n = 1; n < 7; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ICL3(2 * n + 1);
+            x2n2 = ICL3(2 * n + 2);
+            HIL3(n) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+            LOL3(n) = x2n + ((hn1 + hn) >> 1);
+        }
+
+        /* post: output 7, from samples 14..16 (the last real samples) */
+        hn1 = hn;
+        ic14 = x2n = x2n2;
+        x2n1 = ICL3(15);
+        x2n2 = ICL3(16);
+        HIL3(7) = hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL3(7) = x2n + ((hn1 + hn) >> 1);
+        /* low output 8: one more H is computed from mirrored samples; it is used here but never stored */
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 17 -> 15 */
+        x2n2 = ic14;      /* mirror 18 -> 14 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        LOL3(8) = x2n + ((hn1 + hn) >> 1);
+
+    }
+}
+
+/******************************************************************************/
+static void
+rfx_rem_dwt_shift_encode_horz_lv3(const sint16 *in_buffer, sint16 *out_buffer,
+                                  const char *quants)
+{
+    const sint16 *ic; /* input coefficients, start of the current 17-sample row */
+    sint16 *lo;     /* low-pass output row (9 values) */
+    sint16 *hi;     /* high-pass output row (8 values) */
+    sint16 x2n;     /* n[2n]     */
+    sint16 x2n1;    /* n[2n + 1] */
+    sint16 x2n2;    /* n[2n + 2] */
+    sint16 hn1;     /* H[n - 1]  */
+    sint16 hn;      /* H[n]      */
+    sint16 ic14;    /* copy of sample 14, reused as the mirrored sample 18 */
+    int n;          /* output index within the current row */
+    int y;          /* row index within the current half */
+    int lo_fact;    /* lo_fact, hi_fact, lo_half, hi_half and quants are not named again in this body; */
+    int hi_fact;    /* presumably SETUPLOQ/SETUPHIQ fill them from quants and LOQ/HIQ read them.       */
+    int lo_half;    /* Those macros are defined outside this view -- TODO confirm.                     */
+    int hi_half;
+    /* Level-3 horizontal pass: every 17-sample input row becomes 9 low and 8 high values. */
+    SETUPHIQ(1, 0); /* HL3; rfx_rem_quantization_encode takes the HL3 factor from qtable[1] >> 0 */
+    SETUPLOQ(0, 0); /* LL3; rfx_rem_quantization_encode takes the LL3 factor from qtable[0] >> 0 */
+    for (y = 0; y < 9; y++) /* lo: input rows 0..8 */
+    {
+
+        /* setup: input rows are 17 samples apart */
+        ic = in_buffer + 17 * y;
+        lo = out_buffer + 8 * 9 + 9 * 8 + 8 * 8 + 9 * y; /* LL3: 9 per row, after HL3 (8 * 9), LH3 (9 * 8), HH3 (8 * 8) */
+        hi = out_buffer + 8 * y; /* HL3: 8 per row, at the start of out_buffer */
+
+        /* pre: first output pair, from samples 0..2 */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1); /* hn is assigned inside the HIQ argument */
+        lo[0] = LOQ(x2n + hn); /* mirror: there is no H[-1], so H[0] is added on its own */
+
+        /* loop: outputs 1..6; x2n2 of one step becomes x2n of the next */
+        for (n = 1; n < 7; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+            lo[n] = LOQ(x2n + ((hn1 + hn) >> 1));
+        }
+
+        /* post: output 7, from samples 14..16 (the last real samples) */
+        hn1 = hn;
+        ic14 = x2n = x2n2;
+        x2n1 = ic[15];
+        x2n2 = ic[16];
+        hi[7] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+        lo[7] = LOQ(x2n + ((hn1 + hn) >> 1));
+        /* low output 8: one more H is computed from mirrored samples; it is used here but never stored */
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 17 -> 15 */
+        x2n2 = ic14;      /* mirror 18 -> 14 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[8] = LOQ(x2n + ((hn1 + hn) >> 1));
+
+    }
+
+    SETUPHIQ(1, 4); /* HH3; rfx_rem_quantization_encode takes the HH3 factor from qtable[1] >> 4 */
+    SETUPLOQ(0, 4); /* LH3; rfx_rem_quantization_encode takes the LH3 factor from qtable[0] >> 4 */
+    for (y = 0; y < 8; y++) /* hi: input rows 9..16 */
+    {
+        /* setup: skip the 9 rows consumed by the first loop */
+        ic = in_buffer + 17 * (9 + y);
+        lo = out_buffer + 8 * 9 + 9 * y; /* LH3: 9 per row, after HL3 (8 * 9) */
+        hi = out_buffer + 8 * 9 + 9 * 8 + 8 * y; /* HH3: 8 per row, after HL3 (8 * 9) and LH3 (9 * 8) */
+
+        /* pre: first output pair, from samples 0..2 */
+        x2n = ic[0];
+        x2n1 = ic[1];
+        x2n2 = ic[2];
+        hi[0] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1); /* hn is assigned inside the HIQ argument */
+        lo[0] = LOQ(x2n + hn); /* mirror: there is no H[-1], so H[0] is added on its own */
+
+        /* loop: outputs 1..6; x2n2 of one step becomes x2n of the next */
+        for (n = 1; n < 7; n++)
+        {
+            hn1 = hn;
+            x2n = x2n2;
+            x2n1 = ic[2 * n + 1];
+            x2n2 = ic[2 * n + 2];
+            hi[n] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+            lo[n] = LOQ(x2n + ((hn1 + hn) >> 1));
+        }
+
+        /* post: output 7, from samples 14..16 (the last real samples) */
+        hn1 = hn;
+        ic14 = x2n = x2n2;
+        x2n1 = ic[15];
+        x2n2 = ic[16];
+        hi[7] = HIQ(hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1);
+        lo[7] = LOQ(x2n + ((hn1 + hn) >> 1));
+        /* low output 8: one more H is computed from mirrored samples; it is used here but never stored */
+        hn1 = hn;
+        x2n = x2n2;
+        /* x2n1 already set, mirror 17 -> 15 */
+        x2n2 = ic14;      /* mirror 18 -> 14 */
+        hn = (x2n1 - ((x2n + x2n2) >> 1)) >> 1;
+        lo[8] = LOQ(x2n + ((hn1 + hn) >> 1));
+    }
+}
+
+/******************************************************************************/
+int
+rfx_rem_dwt_shift_encode(const uint8 *in_buffer, sint16 *out_buffer,
+                         sint16 *tmp_buffer, const char *quants)
+{ /* Three DWT levels; each runs a vert pass into tmp_buffer, then a horz pass into out_buffer. */
+    rfx_rem_dwt_shift_encode_vert_lv1(in_buffer, tmp_buffer); /* level 1 is the only pass given the uint8 input */
+    rfx_rem_dwt_shift_encode_horz_lv1(tmp_buffer, out_buffer, quants);
+    rfx_rem_dwt_shift_encode_vert_lv2(out_buffer + 3007, tmp_buffer); /* 3007 = 1023 + 1023 + 961, just past HL1, LH1, HH1 */
+    rfx_rem_dwt_shift_encode_horz_lv2(tmp_buffer, out_buffer + 3007, quants); /* output goes to the offset the vert pass was given */
+    rfx_rem_dwt_shift_encode_vert_lv3(out_buffer + 3807, tmp_buffer); /* 3807 = 3007 + 272 + 272 + 256, just past HL2, LH2, HH2 */
+    rfx_rem_dwt_shift_encode_horz_lv3(tmp_buffer, out_buffer + 3807, quants); /* writes HL3, LH3, HH3 and LL3 */
+    return 0; /* no failure path; always 0 */
+}
diff --git a/src/rfxencode_dwt_shift_rem.h b/src/rfxencode_dwt_shift_rem.h
new file mode 100644
index 0000000..f3c7285
--- /dev/null
+++ b/src/rfxencode_dwt_shift_rem.h
@@ -0,0 +1,26 @@
+/**
+ * RFX codec encoder
+ *
+ * Copyright 2020 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __RFXENCODE_DWT_SHIFT_REM_H
+#define __RFXENCODE_DWT_SHIFT_REM_H
+/* Runs three DWT levels from in_buffer into out_buffer, using tmp_buffer between passes; returns 0. uint8/sint16 are not declared in this header. */
+int
+rfx_rem_dwt_shift_encode(const uint8 *in_buffer, sint16 *out_buffer,
+                         sint16 *tmp_buffer, const char *quants);
+
+#endif /* __RFXENCODE_DWT_SHIFT_REM_H */
diff --git a/src/rfxencode_quantization.c b/src/rfxencode_quantization.c
index b10a70e..45b1672 100644
--- a/src/rfxencode_quantization.c
+++ b/src/rfxencode_quantization.c
@@ -137,6 +137,22 @@ rfx_quantization_encode_block(sint16 *buffer, int buffer_size, uint32 factor)
 #endif
 
 /******************************************************************************/
+/* 
+    8 x  8 =   64
+   16 x 16 =  256
+   32 x 32 = 1024
+ 
+   HL1 = 32 x 32 = 1024 (1024)
+   LH1 = 32 x 32 = 1024 (2048)
+   HH1 = 32 x 32 = 1024 (3072)
+   HL2 = 16 x 16 =  256 (3328)
+   LH2 = 16 x 16 =  256 (3584)
+   HH2 = 16 x 16 =  256 (3840)
+   HL3 =  8 x  8 =   64 (3904)
+   LH3 =  8 x  8 =   64 (3968)
+   HH3 =  8 x  8 =   64 (4032)
+   LL3 =  8 x  8 =   64 (4096)
+*/
 int
 rfx_quantization_encode(sint16 *buffer, const char *qtable)
 {
@@ -165,3 +181,52 @@ rfx_quantization_encode(sint16 *buffer, const char *qtable)
     return 0;
 }
 
+/******************************************************************************/
+/* Layout of the 4096 coefficients handled by rfx_rem_quantization_encode below.
+    8 x  8 =   64
+    8 x  9 =   72
+    9 x  9 =   81
+   16 x 16 =  256
+   16 x 17 =  272
+   31 x 31 =  961
+   31 x 33 = 1023
+   Band = dimensions = coefficient count (offset in buffer where the band ends)
+   HL1 = 31 x 33 = 1023 (1023)
+   LH1 = 33 x 31 = 1023 (2046)
+   HH1 = 31 x 31 =  961 (3007)
+   HL2 = 16 x 17 =  272 (3279)
+   LH2 = 17 x 16 =  272 (3551)
+   HH2 = 16 x 16 =  256 (3807)
+   HL3 =  8 x  9 =   72 (3879)
+   LH3 =  9 x  8 =   72 (3951)
+   HH3 =  8 x  8 =   64 (4015)
+   LL3 =  9 x  9 =   81 (4096)
+*/
+int
+rfx_rem_quantization_encode(sint16 *buffer, const char *qtable)
+{
+    uint32 factor; /* one 4-bit field of qtable, minus 6; recomputed before every band */
+    /* NOTE(review): if uint32 is unsigned, a 4-bit field below 6 wraps to a huge factor -- confirm qtable never holds one. */
+    factor = ((qtable[4] >> 0) & 0xf) - 6; /* low nibble of qtable[4] */
+    rfx_quantization_encode_block(buffer, 1023, factor); /* HL1: buffer[0..1022] */
+    factor = ((qtable[3] >> 4) & 0xf) - 6; /* high nibble of qtable[3] */
+    rfx_quantization_encode_block(buffer + 1023, 1023, factor); /* LH1: buffer[1023..2045] */
+    factor = ((qtable[4] >> 4) & 0xf) - 6; /* high nibble of qtable[4] */
+    rfx_quantization_encode_block(buffer + 2046, 961, factor); /* HH1: buffer[2046..3006] */
+    factor = ((qtable[2] >> 4) & 0xf) - 6; /* high nibble of qtable[2] */
+    rfx_quantization_encode_block(buffer + 3007, 272, factor); /* HL2: buffer[3007..3278] */
+    factor = ((qtable[2] >> 0) & 0xf) - 6; /* low nibble of qtable[2] */
+    rfx_quantization_encode_block(buffer + 3279, 272, factor); /* LH2: buffer[3279..3550] */
+    factor = ((qtable[3] >> 0) & 0xf) - 6; /* low nibble of qtable[3] */
+    rfx_quantization_encode_block(buffer + 3551, 256, factor); /* HH2: buffer[3551..3806] */
+    factor = ((qtable[1] >> 0) & 0xf) - 6; /* low nibble of qtable[1] */
+    rfx_quantization_encode_block(buffer + 3807, 72, factor); /* HL3: buffer[3807..3878] */
+    factor = ((qtable[0] >> 4) & 0xf) - 6; /* high nibble of qtable[0] */
+    rfx_quantization_encode_block(buffer + 3879, 72, factor); /* LH3: buffer[3879..3950] */
+    factor = ((qtable[1] >> 4) & 0xf) - 6; /* high nibble of qtable[1] */
+    rfx_quantization_encode_block(buffer + 3951, 64, factor); /* HH3: buffer[3951..4014] */
+    factor = ((qtable[0] >> 0) & 0xf) - 6; /* low nibble of qtable[0] */
+    rfx_quantization_encode_block(buffer + 4015, 81, factor); /* LL3: buffer[4015..4095] */
+    return 0; /* no failure path; always 0 */
+}
+
diff --git a/src/rfxencode_quantization.h b/src/rfxencode_quantization.h
index d246889..d08b533 100644
--- a/src/rfxencode_quantization.h
+++ b/src/rfxencode_quantization.h
@@ -24,5 +24,7 @@
 
 int
 rfx_quantization_encode(sint16 *buffer, const char *quantization_values);
+int
+rfx_rem_quantization_encode(sint16 *buffer, const char *quantization_values);
 
 #endif /* __RFX_QUANTIZATION_H */
diff --git a/src/rfxencode_rlgr1.c b/src/rfxencode_rlgr1.c
index be9c8ea..e3dcf88 100644
--- a/src/rfxencode_rlgr1.c
+++ b/src/rfxencode_rlgr1.c
@@ -165,9 +165,13 @@ rfx_rlgr1_encode(const sint16 *data, uint8 *buffer, int buffer_size)
             /* collect the run of zeros in the input stream */
             numZeros = 0;
             GetNextInput(input);
-            while (input == 0 && data_size > 0)
+            while (input == 0)
             {
                 numZeros++;
+                if (data_size < 1)
+                {
+                    break;
+                }
                 GetNextInput(input);
             }
 
@@ -187,15 +191,17 @@ rfx_rlgr1_encode(const sint16 *data, uint8 *buffer, int buffer_size)
             /* output the remaining run length using k bits */
             OutputBits(k, numZeros);
 
-            /* note: when we reach here and the last byte being encoded is 0, we still
-               need to output the last two bits, otherwise mstsc will crash */
+            if (input == 0)
+            {
+                continue;
+            }
 
             /* encode the nonzero value using GR coding */
             mag = (input < 0 ? -input : input); /* absolute value of input coefficient */
             sign = (input < 0 ? 1 : 0);  /* sign of input coefficient */
 
             OutputBit(1, sign); /* output the sign bit */
-            lmag = mag ? mag - 1 : 0;
+            lmag = mag - 1;
             CodeGR(krp, lmag); /* output GR code for (mag - 1) */
 
             UpdateParam(kp, -DN_GR, k);
diff --git a/src/rfxencode_rlgr3.c b/src/rfxencode_rlgr3.c
index e66b38f..5c26985 100644
--- a/src/rfxencode_rlgr3.c
+++ b/src/rfxencode_rlgr3.c
@@ -168,9 +168,13 @@ rfx_rlgr3_encode(const sint16 *data, uint8 *buffer, int buffer_size)
             /* collect the run of zeros in the input stream */
             numZeros = 0;
             GetNextInput(input);
-            while (input == 0 && data_size > 0)
+            while (input == 0)
             {
                 numZeros++;
+                if (data_size < 1)
+                {
+                    break;
+                }
                 GetNextInput(input);
             }
 
@@ -190,15 +194,17 @@ rfx_rlgr3_encode(const sint16 *data, uint8 *buffer, int buffer_size)
             /* output the remaining run length using k bits */
             OutputBits(k, numZeros);
 
-            /* note: when we reach here and the last byte being encoded is 0, we still
-               need to output the last two bits, otherwise mstsc will crash */
+            if (input == 0)
+            {
+                continue;
+            }
 
             /* encode the nonzero value using GR coding */
             mag = (input < 0 ? -input : input); /* absolute value of input coefficient */
             sign = (input < 0 ? 1 : 0);  /* sign of input coefficient */
 
             OutputBit(1, sign); /* output the sign bit */
-            lmag = mag ? mag - 1 : 0;
+            lmag = mag - 1;
             CodeGR(krp, lmag); /* output GR code for (mag - 1) */
 
             UpdateParam(kp, -DN_GR, k);
diff --git a/src/rfxencode_tile.c b/src/rfxencode_tile.c
index 763cab8..a0ec108 100644
--- a/src/rfxencode_tile.c
+++ b/src/rfxencode_tile.c
@@ -33,10 +33,13 @@
 #include "rfxconstants.h"
 #include "rfxencode_tile.h"
 #include "rfxencode_dwt.h"
+#include "rfxencode_dwt_rem.h"
 #include "rfxencode_quantization.h"
 #include "rfxencode_differential.h"
 #include "rfxencode_rlgr1.h"
 #include "rfxencode_rlgr3.h"
+#include "rfxencode_diff_rlgr1.h"
+#include "rfxencode_diff_rlgr3.h"
 #include "rfxencode_alpha.h"
 
 #ifdef RFX_USE_ACCEL_X86
diff --git a/src/x86/rfxencode_tile_x86.c b/src/x86/rfxencode_tile_x86.c
index a84305e..b65e1a9 100644
--- a/src/x86/rfxencode_tile_x86.c
+++ b/src/x86/rfxencode_tile_x86.c
@@ -53,7 +53,7 @@ rfx_encode_component_rlgr1_x86_sse2(struct rfxencode *enc, const char *qtable,
     {
         return 1;
     }
-    *size = rfx_encode_diff_rlgr1(enc->dwt_buffer1, buffer, buffer_size);
+    *size = rfx_encode_diff_rlgr1(enc->dwt_buffer1, buffer, buffer_size, 64);
     return 0;
 }
 
@@ -69,7 +69,7 @@ rfx_encode_component_rlgr3_x86_sse2(struct rfxencode *enc, const char *qtable,
     {
         return 1;
     }
-    *size = rfx_encode_diff_rlgr3(enc->dwt_buffer1, buffer, buffer_size);
+    *size = rfx_encode_diff_rlgr3(enc->dwt_buffer1, buffer, buffer_size, 64);
     return 0;
 }
 
@@ -85,7 +85,7 @@ rfx_encode_component_rlgr1_x86_sse41(struct rfxencode *enc, const char *qtable,
     {
         return 1;
     }
-    *size = rfx_encode_diff_rlgr1(enc->dwt_buffer1, buffer, buffer_size);
+    *size = rfx_encode_diff_rlgr1(enc->dwt_buffer1, buffer, buffer_size, 64);
     return 0;
 }
 
@@ -101,6 +101,6 @@ rfx_encode_component_rlgr3_x86_sse41(struct rfxencode *enc, const char *qtable,
     {
         return 1;
     }
-    *size = rfx_encode_diff_rlgr3(enc->dwt_buffer1, buffer, buffer_size);
+    *size = rfx_encode_diff_rlgr3(enc->dwt_buffer1, buffer, buffer_size, 64);
     return 0;
 }
author	Jay Sorg <jay.sorg@gmail.com>	2020-12-07 11:29:46 +0300
committer	Nexarian <cmp@pitstick.net>	2022-05-09 01:54:06 +0300
commit	34d051363112b16b7df511f08524f69a925939a5 (patch)
tree	e4e9a53f1cee17735cc237101a9bacc74792085a
parent	d8f126abc48a1b949a0be27b334099161c06f0fc (diff)