Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/neutrinolabs/librfxcodec.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Idan Freiberg <speidy@gmail.com> 2016-12-20 09:05:56 +0300
committer GitHub <noreply@github.com> 2016-12-20 09:05:56 +0300
commit 03a36890da75d6c27a31bcd1caae4bb9c839c6f6 (patch)
tree 9c12c55f204adb3257c757a8480ae5a48f42f164
parent 3f137a65cf94c72c2bea92a58f637454c699c0bd (diff)
parent bc30b7af54d29ca4aae99d6504455bfec012c074 (diff)
Merge pull request #7 from neutrinolabs/devel v0.1.0
Merge devel branch (for 0.9.1 version)
-rw-r--r--.gitignore32
-rw-r--r--Makefile12
-rw-r--r--Makefile.am14
-rw-r--r--acinclude.m4137
-rwxr-xr-xbootstrap32
-rw-r--r--configure.ac53
-rw-r--r--include/rfxcodec_common.h40
-rw-r--r--include/rfxcodec_decode.h33
-rw-r--r--include/rfxcodec_encode.h44
-rw-r--r--m4/pkg.m4275
-rw-r--r--rfxcodec-uninstalled.pc.in5
-rw-r--r--rfxcodec.pc.in10
-rw-r--r--src/Makefile47
-rw-r--r--src/Makefile.am57
-rw-r--r--src/amd64/cpuid_amd64.asm13
-rw-r--r--src/amd64/funcs_amd64.h42
-rw-r--r--src/amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm36
-rw-r--r--src/amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm31
-rw-r--r--src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm1503
-rw-r--r--src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm1371
-rw-r--r--src/amd64/rfxdwt_amd64_sse2.asm21
-rw-r--r--src/amd64/rfxrlgr1_amd64.asm21
-rw-r--r--src/amd64/rfxrlgr3_amd64.asm21
-rwxr-xr-xsrc/nasm_lt.sh57
-rw-r--r--src/rfxcommon.h4
-rw-r--r--src/rfxcompose.c267
-rw-r--r--src/rfxcompose.h8
-rw-r--r--src/rfxconstants.h1
-rw-r--r--src/rfxencode.c265
-rw-r--r--src/rfxencode.h23
-rw-r--r--src/rfxencode_alpha.c279
-rw-r--r--src/rfxencode_alpha.h28
-rw-r--r--src/rfxencode_dwt.c27
-rw-r--r--src/rfxencode_dwt.h2
-rw-r--r--src/rfxencode_quantization.c60
-rw-r--r--src/rfxencode_quantization.h2
-rw-r--r--src/rfxencode_rlgr1.c4
-rw-r--r--src/rfxencode_rlgr1.h2
-rw-r--r--src/rfxencode_rlgr3.c4
-rw-r--r--src/rfxencode_rlgr3.h2
-rw-r--r--src/rfxencode_tile.c671
-rw-r--r--src/rfxencode_tile.h71
-rw-r--r--src/x86/cpuid_x86.asm7
-rw-r--r--src/x86/funcs_x86.h43
-rw-r--r--src/x86/readme.txt0
-rw-r--r--src/x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm35
-rw-r--r--src/x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm35
-rw-r--r--src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm1533
-rw-r--r--src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm1401
-rw-r--r--src/x86/rfxdwt_x86_sse2.asm25
-rw-r--r--src/x86/rfxrlgr1_x86.asm25
-rw-r--r--src/x86/rfxrlgr3_x86.asm25
-rw-r--r--tests/Makefile22
-rw-r--r--tests/Makefile.am11
-rw-r--r--tests/rfxcodectest.c75
55 files changed, 8316 insertions, 548 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..70869b3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,32 @@
+*~
+aclocal.m4
+AUTHORS
+autom4te.cache/
+ChangeLog
+config_ac.h
+config_ac-h.in
+config.c
+config.guess
+config.log
+config.status
+config.sub
+configure
+compile
+depcomp
+.deps/
+install-sh
+*.la
+.libs
+libtool
+*.lo
+ltmain.sh
+Makefile
+Makefile.in
+missing
+NEWS
+*.o
+*.pc
+README
+stamp-h1
+rfxcodectest
+.dirstamp
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 0d2e4ce..0000000
--- a/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-
-all: allmake
-
-allmake:
- cd src; $(MAKE) $(MFLAGS)
- cd tests; $(MAKE) $(MFLAGS)
-
-clean: allclean
-
-allclean:
- cd src; $(MAKE) clean
- cd tests; $(MAKE) clean
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..d0034d0
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,14 @@
+ACLOCAL_AMFLAGS = -I m4
+
+pkgconfig_DATA = rfxcodec.pc
+
+EXTRA_DIST = bootstrap readme.txt
+
+SUBDIRS = \
+ src \
+ tests
+
+include_HEADERS = \
+ include/rfxcodec_encode.h \
+ include/rfxcodec_decode.h \
+ include/rfxcodec_common.h
diff --git a/acinclude.m4 b/acinclude.m4
new file mode 100644
index 0000000..fbfc98d
--- /dev/null
+++ b/acinclude.m4
@@ -0,0 +1,137 @@
+# AC_PROG_NASM
+# --------------------------
+# Check that NASM exists and determine flags
+AC_DEFUN([AC_PROG_NASM],[
+
+AC_CHECK_PROGS(NASM, [nasm nasmw yasm])
+test -z "$NASM" && AC_MSG_ERROR([no nasm (Netwide Assembler) found])
+
+AC_MSG_CHECKING([for object file format of host system])
+case "$host_os" in
+ cygwin* | mingw* | pw32* | interix*)
+ case "$host_cpu" in
+ x86_64)
+ objfmt='Win64-COFF'
+ ;;
+ *)
+ objfmt='Win32-COFF'
+ ;;
+ esac
+ ;;
+ msdosdjgpp* | go32*)
+ objfmt='COFF'
+ ;;
+ os2-emx*) # not tested
+ objfmt='MSOMF' # obj
+ ;;
+ linux*coff* | linux*oldld*)
+ objfmt='COFF' # ???
+ ;;
+ linux*aout*)
+ objfmt='a.out'
+ ;;
+ linux*)
+ case "$host_cpu" in
+ x86_64)
+ objfmt='ELF64'
+ ;;
+ *)
+ objfmt='ELF'
+ ;;
+ esac
+ ;;
+ freebsd* | netbsd* | openbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
+ objfmt='BSD-a.out'
+ else
+ case "$host_cpu" in
+ x86_64 | amd64)
+ objfmt='ELF64'
+ ;;
+ *)
+ objfmt='ELF'
+ ;;
+ esac
+ fi
+ ;;
+ solaris* | sunos* | sysv* | sco*)
+ case "$host_cpu" in
+ x86_64)
+ objfmt='ELF64'
+ ;;
+ *)
+ objfmt='ELF'
+ ;;
+ esac
+ ;;
+ darwin* | rhapsody* | nextstep* | openstep* | macos*)
+ case "$host_cpu" in
+ x86_64)
+ objfmt='Mach-O64'
+ ;;
+ *)
+ objfmt='Mach-O'
+ ;;
+ esac
+ ;;
+ *)
+ objfmt='ELF ?'
+ ;;
+esac
+
+AC_MSG_RESULT([$objfmt])
+if test "$objfmt" = 'ELF ?'; then
+ objfmt='ELF'
+ AC_MSG_WARN([unexpected host system. assumed that the format is $objfmt.])
+fi
+
+AC_MSG_CHECKING([for object file format specifier (NAFLAGS) ])
+case "$objfmt" in
+ MSOMF) NAFLAGS='-fobj -DOBJ32';;
+ Win32-COFF) NAFLAGS='-fwin32 -DWIN32';;
+ Win64-COFF) NAFLAGS='-fwin64 -DWIN64 -D__x86_64__';;
+ COFF) NAFLAGS='-fcoff -DCOFF';;
+ a.out) NAFLAGS='-faout -DAOUT';;
+ BSD-a.out) NAFLAGS='-faoutb -DAOUT';;
+ ELF) NAFLAGS='-felf -DELF';;
+ ELF64) NAFLAGS='-felf64 -DELF -D__x86_64__';;
+ RDF) NAFLAGS='-frdf -DRDF';;
+ Mach-O) NAFLAGS='-fmacho -DMACHO';;
+ Mach-O64) NAFLAGS='-fmacho64 -DMACHO -D__x86_64__';;
+esac
+AC_MSG_RESULT([$NAFLAGS])
+AC_SUBST([NAFLAGS])
+
+AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works])
+cat > conftest.asm <<EOF
+[%line __oline__ "configure"
+ section .text
+ global _main,main
+_main:
+main: xor eax,eax
+ ret
+]EOF
+try_nasm='$NASM $NAFLAGS -o conftest.o conftest.asm'
+if AC_TRY_EVAL(try_nasm) && test -s conftest.o; then
+ AC_MSG_RESULT(yes)
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.asm >&AC_FD_CC
+ rm -rf conftest*
+ AC_MSG_RESULT(no)
+ AC_MSG_ERROR([installation or configuration problem: assembler cannot create object files.])
+fi
+
+AC_MSG_CHECKING([whether the linker accepts assembler output])
+try_nasm='${CC-cc} -o conftest${ac_exeext} $LDFLAGS conftest.o $LIBS 1>&AC_FD_CC'
+if AC_TRY_EVAL(try_nasm) && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ AC_MSG_RESULT(yes)
+else
+ rm -rf conftest*
+ AC_MSG_RESULT(no)
+ AC_MSG_ERROR([configuration problem: maybe object file format mismatch.])
+fi
+
+])
+
diff --git a/bootstrap b/bootstrap
new file mode 100755
index 0000000..a5ef9dd
--- /dev/null
+++ b/bootstrap
@@ -0,0 +1,32 @@
+#!/bin/sh
+
+which autoconf
+if ! test $? -eq 0
+then
+ echo "error, install autoconf"
+ exit 1
+fi
+
+which automake
+if ! test $? -eq 0
+then
+ echo "error, install automake"
+ exit 1
+fi
+
+which libtool || which libtoolize
+if ! test $? -eq 0
+then
+ echo "error, install libtool"
+ exit 1
+fi
+
+which pkg-config
+if ! test $? -eq 0
+then
+ echo "error, install pkg-config"
+ exit 1
+fi
+
+touch configure.ac
+autoreconf -fvi
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..e5bc1d8
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,53 @@
+# Process this file with autoconf to produce a configure script
+
+AC_PREREQ(2.59)
+AC_INIT([rfxcodec], [0.1.0], [jay.sorg@gmail.com])
+AC_CONFIG_HEADERS(config_ac.h:config_ac-h.in)
+AM_INIT_AUTOMAKE([1.6 foreign])
+AC_CONFIG_MACRO_DIR([m4])
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES])
+AC_PROG_CC
+AC_C_CONST
+AC_PROG_LIBTOOL
+PKG_INSTALLDIR
+
+# SIMD is optional
+AC_ARG_WITH([simd],
+ AC_HELP_STRING([--without-simd],[Omit SIMD extensions.]))
+if test "x${with_simd}" != "xno"; then
+ # Check if we're on a supported CPU
+ AC_MSG_CHECKING([if we have SIMD optimisations for cpu type])
+ case "$host_cpu" in
+ x86_64 | amd64)
+ AC_MSG_RESULT([yes (x86_64)])
+ AC_PROG_NASM
+ simd_arch=x86_64
+ ;;
+ i*86 | x86 | ia32)
+ AC_MSG_RESULT([yes (i386)])
+ AC_PROG_NASM
+ simd_arch=i386
+ ;;
+ *)
+ AC_MSG_RESULT([no ("$host_cpu")])
+ AC_MSG_WARN([SIMD support not available for this CPU. Performance will suffer.])
+ with_simd=no;
+ ;;
+ esac
+ if test "x${with_simd}" != "xno"; then
+ AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.])
+ fi
+fi
+
+AM_CONDITIONAL(WITH_SIMD_AMD64, [test x$simd_arch = xx86_64])
+AM_CONDITIONAL(WITH_SIMD_X86, [test x$simd_arch = xi386])
+
+AC_CONFIG_FILES([Makefile
+ src/Makefile
+ tests/Makefile
+ rfxcodec.pc
+ rfxcodec-uninstalled.pc
+])
+
+AC_OUTPUT
+
diff --git a/include/rfxcodec_common.h b/include/rfxcodec_common.h
new file mode 100644
index 0000000..0411c73
--- /dev/null
+++ b/include/rfxcodec_common.h
@@ -0,0 +1,40 @@
+/**
+ * RFX codec
+ *
+ * Copyright 2015 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __RFXCODEC_COMMON_H
+#define __RFXCODEC_COMMON_H
+
+#define RFX_FORMAT_BGRA 0
+#define RFX_FORMAT_RGBA 1
+#define RFX_FORMAT_BGR 2
+#define RFX_FORMAT_RGB 3
+#define RFX_FORMAT_YUV 4 /* YUV444 linear tiled mode */
+
+#define RFX_FLAGS_NONE 0 /* default RFX_FLAGS_RLGR3 and RFX_FLAGS_SAFE */
+
+#define RFX_FLAGS_SAFE 0 /* default */
+#define RFX_FLAGS_OPT1 (1 << 3)
+#define RFX_FLAGS_OPT2 (1 << 4)
+#define RFX_FLAGS_NOACCEL (1 << 6)
+
+#define RFX_FLAGS_RLGR3 0 /* default */
+#define RFX_FLAGS_RLGR1 1
+
+#define RFX_FLAGS_ALPHAV1 1 /* used in flags for rfxcodec_encode */
+
+#endif
diff --git a/include/rfxcodec_decode.h b/include/rfxcodec_decode.h
new file mode 100644
index 0000000..ed4b0a8
--- /dev/null
+++ b/include/rfxcodec_decode.h
@@ -0,0 +1,33 @@
+/**
+ * RFX codec decoder
+ *
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __RFXCODEC_DECODE_H
+#define __RFXCODEC_DECODE_H
+
+#include <rfxcodec_common.h>
+
+int
+rfxcodec_decode_create(int width, int height, int format, int flags,
+ void **handle);
+int
+rfxcodec_decode_destroy(void *handle);
+int
+rfxcodec_decode(void *handle, char *cdata, int cdata_bytes,
+ char *data, int width, int height, int stride_bytes);
+
+#endif
diff --git a/include/rfxcodec_encode.h b/include/rfxcodec_encode.h
index f082dd2..04112b0 100644
--- a/include/rfxcodec_encode.h
+++ b/include/rfxcodec_encode.h
@@ -1,7 +1,7 @@
/**
* RFX codec encoder
*
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,23 +19,7 @@
#ifndef __RFXCODEC_ENCODE_H
#define __RFXCODEC_ENCODE_H
-#define RFX_USE_ACCEL 0
-
-#define RFX_FORMAT_BGRA 0
-#define RFX_FORMAT_RGBA 1
-#define RFX_FORMAT_BGR 2
-#define RFX_FORMAT_RGB 3
-#define RFX_FORMAT_YUV 4 /* YUV444 linear tiled mode */
-
-#define RFX_FLAGS_NONE 0 /* default RFX_FLAGS_RLGR3 and RFX_FLAGS_SAFE */
-
-#define RFX_FLAGS_RLGR3 0 /* default */
-#define RFX_FLAGS_RLGR1 1
-
-#define RFX_FLAGS_SAFE 0 /* default */
-#define RFX_FLAGS_OPT1 (1 << 3)
-#define RFX_FLAGS_OPT2 (1 << 4)
-#define RFX_FLAGS_NOACCEL (1 << 6)
+#include <rfxcodec_common.h>
struct rfx_rect
{
@@ -49,8 +33,8 @@ struct rfx_tile
{
int x; /* multiple of 64 */
int y; /* multiple of 64 */
- int cx; /* must be 64 */
- int cy; /* must be 64 */
+ int cx; /* must be 64 or less */
+ int cy; /* must be 64 or less */
int quant_y;
int quant_cb;
int quant_cr;
@@ -59,8 +43,12 @@ struct rfx_tile
void *
rfxcodec_encode_create(int width, int height, int format, int flags);
int
-rfxcodec_encode_destroy(void * handle);
-/* quants, 10 ints per set, should be num_quants * 10 ints in quants)
+rfxcodec_encode_create_ex(int width, int height, int format, int flags,
+ void **handle);
+int
+rfxcodec_encode_destroy(void *handle);
+/* quants, 5 chars per set (should be num_quants * 5 chars in quants)
+ * each char is 2 quant values
* quantizer order is
* 0 - LL3
* 1 - LH3
@@ -75,8 +63,14 @@ rfxcodec_encode_destroy(void * handle);
int
rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes,
char *buf, int width, int height, int stride_bytes,
- struct rfx_rect *region, int num_region,
- struct rfx_tile *tiles, int num_tiles,
- const int *quants, int num_quants);
+ const struct rfx_rect *region, int num_region,
+ const struct rfx_tile *tiles, int num_tiles,
+ const char *quants, int num_quants);
+int
+rfxcodec_encode_ex(void *handle, char *cdata, int *cdata_bytes,
+ char *buf, int width, int height, int stride_bytes,
+ const struct rfx_rect *region, int num_region,
+ const struct rfx_tile *tiles, int num_tiles,
+ const char *quants, int num_quants, int flags);
#endif
diff --git a/m4/pkg.m4 b/m4/pkg.m4
new file mode 100644
index 0000000..82bea96
--- /dev/null
+++ b/m4/pkg.m4
@@ -0,0 +1,275 @@
+dnl pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*-
+dnl serial 11 (pkg-config-0.29.1)
+dnl
+dnl Copyright © 2004 Scott James Remnant <scott@netsplit.com>.
+dnl Copyright © 2012-2015 Dan Nicholson <dbn.lists@gmail.com>
+dnl
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or
+dnl (at your option) any later version.
+dnl
+dnl This program is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software
+dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+dnl 02111-1307, USA.
+dnl
+dnl As a special exception to the GNU General Public License, if you
+dnl distribute this file as part of a program that contains a
+dnl configuration script generated by Autoconf, you may include it under
+dnl the same distribution terms that you use for the rest of that
+dnl program.
+
+dnl PKG_PREREQ(MIN-VERSION)
+dnl -----------------------
+dnl Since: 0.29
+dnl
+dnl Verify that the version of the pkg-config macros are at least
+dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's
+dnl installed version of pkg-config, this checks the developer's version
+dnl of pkg.m4 when generating configure.
+dnl
+dnl To ensure that this macro is defined, also add:
+dnl m4_ifndef([PKG_PREREQ],
+dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])])
+dnl
+dnl See the "Since" comment for each macro you use to see what version
+dnl of the macros you require.
+m4_defun([PKG_PREREQ],
+[m4_define([PKG_MACROS_VERSION], [0.29.1])
+m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1,
+ [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])])
+])dnl PKG_PREREQ
+
+dnl PKG_PROG_PKG_CONFIG([MIN-VERSION])
+dnl ----------------------------------
+dnl Since: 0.16
+dnl
+dnl Search for the pkg-config tool and set the PKG_CONFIG variable to
+dnl first found in the path. Checks that the version of pkg-config found
+dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is
+dnl used since that's the first version where most current features of
+dnl pkg-config existed.
+AC_DEFUN([PKG_PROG_PKG_CONFIG],
+[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
+m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$])
+m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$])
+AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])
+AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path])
+AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path])
+
+if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
+ AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
+fi
+if test -n "$PKG_CONFIG"; then
+ _pkg_min_version=m4_default([$1], [0.9.0])
+ AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
+ if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ PKG_CONFIG=""
+ fi
+fi[]dnl
+])dnl PKG_PROG_PKG_CONFIG
+
+dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+dnl -------------------------------------------------------------------
+dnl Since: 0.18
+dnl
+dnl Check to see whether a particular set of modules exists. Similar to
+dnl PKG_CHECK_MODULES(), but does not set variables or print errors.
+dnl
+dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+dnl only at the first occurrence in configure.ac, so if the first place
+dnl it's called might be skipped (such as if it is within an "if"), you
+dnl have to call PKG_CHECK_EXISTS manually
+AC_DEFUN([PKG_CHECK_EXISTS],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+if test -n "$PKG_CONFIG" && \
+ AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
+ m4_default([$2], [:])
+m4_ifvaln([$3], [else
+ $3])dnl
+fi])
+
+dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
+dnl ---------------------------------------------
+dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting
+dnl pkg_failed based on the result.
+m4_define([_PKG_CONFIG],
+[if test -n "$$1"; then
+ pkg_cv_[]$1="$$1"
+ elif test -n "$PKG_CONFIG"; then
+ PKG_CHECK_EXISTS([$3],
+ [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`
+ test "x$?" != "x0" && pkg_failed=yes ],
+ [pkg_failed=yes])
+ else
+ pkg_failed=untried
+fi[]dnl
+])dnl _PKG_CONFIG
+
+dnl _PKG_SHORT_ERRORS_SUPPORTED
+dnl ---------------------------
+dnl Internal check to see if pkg-config supports short errors.
+AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+ _pkg_short_errors_supported=yes
+else
+ _pkg_short_errors_supported=no
+fi[]dnl
+])dnl _PKG_SHORT_ERRORS_SUPPORTED
+
+
+dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+dnl [ACTION-IF-NOT-FOUND])
+dnl --------------------------------------------------------------
+dnl Since: 0.4.0
+dnl
+dnl Note that if there is a possibility the first call to
+dnl PKG_CHECK_MODULES might not happen, you should be sure to include an
+dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
+AC_DEFUN([PKG_CHECK_MODULES],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
+AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
+
+pkg_failed=no
+AC_MSG_CHECKING([for $1])
+
+_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
+_PKG_CONFIG([$1][_LIBS], [libs], [$2])
+
+m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
+and $1[]_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.])
+
+if test $pkg_failed = yes; then
+ AC_MSG_RESULT([no])
+ _PKG_SHORT_ERRORS_SUPPORTED
+ if test $_pkg_short_errors_supported = yes; then
+ $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1`
+ else
+ $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1`
+ fi
+ # Put the nasty error message in config.log where it belongs
+ echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
+
+ m4_default([$4], [AC_MSG_ERROR(
+[Package requirements ($2) were not met:
+
+$$1_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+_PKG_TEXT])[]dnl
+ ])
+elif test $pkg_failed = untried; then
+ AC_MSG_RESULT([no])
+ m4_default([$4], [AC_MSG_FAILURE(
+[The pkg-config script could not be found or is too old. Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+_PKG_TEXT
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.])[]dnl
+ ])
+else
+ $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
+ $1[]_LIBS=$pkg_cv_[]$1[]_LIBS
+ AC_MSG_RESULT([yes])
+ $3
+fi[]dnl
+])dnl PKG_CHECK_MODULES
+
+
+dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+dnl [ACTION-IF-NOT-FOUND])
+dnl ---------------------------------------------------------------------
+dnl Since: 0.29
+dnl
+dnl Checks for existence of MODULES and gathers its build flags with
+dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags
+dnl and VARIABLE-PREFIX_LIBS from --libs.
+dnl
+dnl Note that if there is a possibility the first call to
+dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to
+dnl include an explicit call to PKG_PROG_PKG_CONFIG in your
+dnl configure.ac.
+AC_DEFUN([PKG_CHECK_MODULES_STATIC],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+_save_PKG_CONFIG=$PKG_CONFIG
+PKG_CONFIG="$PKG_CONFIG --static"
+PKG_CHECK_MODULES($@)
+PKG_CONFIG=$_save_PKG_CONFIG[]dnl
+])dnl PKG_CHECK_MODULES_STATIC
+
+
+dnl PKG_INSTALLDIR([DIRECTORY])
+dnl -------------------------
+dnl Since: 0.27
+dnl
+dnl Substitutes the variable pkgconfigdir as the location where a module
+dnl should install pkg-config .pc files. By default the directory is
+dnl $libdir/pkgconfig, but the default can be changed by passing
+dnl DIRECTORY. The user can override through the --with-pkgconfigdir
+dnl parameter.
+AC_DEFUN([PKG_INSTALLDIR],
+[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])])
+m4_pushdef([pkg_description],
+ [pkg-config installation directory @<:@]pkg_default[@:>@])
+AC_ARG_WITH([pkgconfigdir],
+ [AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],,
+ [with_pkgconfigdir=]pkg_default)
+AC_SUBST([pkgconfigdir], [$with_pkgconfigdir])
+m4_popdef([pkg_default])
+m4_popdef([pkg_description])
+])dnl PKG_INSTALLDIR
+
+
+dnl PKG_NOARCH_INSTALLDIR([DIRECTORY])
+dnl --------------------------------
+dnl Since: 0.27
+dnl
+dnl Substitutes the variable noarch_pkgconfigdir as the location where a
+dnl module should install arch-independent pkg-config .pc files. By
+dnl default the directory is $datadir/pkgconfig, but the default can be
+dnl changed by passing DIRECTORY. The user can override through the
+dnl --with-noarch-pkgconfigdir parameter.
+AC_DEFUN([PKG_NOARCH_INSTALLDIR],
+[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])])
+m4_pushdef([pkg_description],
+ [pkg-config arch-independent installation directory @<:@]pkg_default[@:>@])
+AC_ARG_WITH([noarch-pkgconfigdir],
+ [AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],,
+ [with_noarch_pkgconfigdir=]pkg_default)
+AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir])
+m4_popdef([pkg_default])
+m4_popdef([pkg_description])
+])dnl PKG_NOARCH_INSTALLDIR
+
+
+dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE,
+dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+dnl -------------------------------------------
+dnl Since: 0.28
+dnl
+dnl Retrieves the value of the pkg-config variable for the given module.
+AC_DEFUN([PKG_CHECK_VAR],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl
+
+_PKG_CONFIG([$1], [variable="][$3]["], [$2])
+AS_VAR_COPY([$1], [pkg_cv_][$1])
+
+AS_VAR_IF([$1], [""], [$5], [$4])dnl
+])dnl PKG_CHECK_VAR
diff --git a/rfxcodec-uninstalled.pc.in b/rfxcodec-uninstalled.pc.in
new file mode 100644
index 0000000..9aed766
--- /dev/null
+++ b/rfxcodec-uninstalled.pc.in
@@ -0,0 +1,5 @@
+Name: rfxcodec
+Description: Fast jpeg2000 codec compatible with MS RDP servers and xrdp
+Version: @PACKAGE_VERSION@
+Cflags: -I${pc_top_builddir}/${pcfiledir}/include
+Libs: ${pc_top_builddir}/${pcfiledir}/src/librfxencode.la
diff --git a/rfxcodec.pc.in b/rfxcodec.pc.in
new file mode 100644
index 0000000..8bd611c
--- /dev/null
+++ b/rfxcodec.pc.in
@@ -0,0 +1,10 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: rfxcodec
+Description: Fast jpeg2000 codec compatible with MS RDP servers and xrdp
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lrfxencode
diff --git a/src/Makefile b/src/Makefile
deleted file mode 100644
index 4c657c7..0000000
--- a/src/Makefile
+++ /dev/null
@@ -1,47 +0,0 @@
-
-OBJS = rfxencode.o rfxcompose.o rfxencode_tile.o rfxencode_dwt.o \
-rfxencode_quantization.o rfxencode_differential.o \
-rfxencode_rlgr1.o rfxencode_rlgr3.o
-
-#OBJS += cpuid_x86.o rfxrlgr1_x86.o rfxrlgr3_x86.o rfxdwt_x86_sse2.o
-#OBJS += cpuid_amd64.o rfxrlgr1_amd64.o rfxrlgr3_amd64.o rfxdwt_amd64_sse2.o
-
-CFLAGS = $(PROFIL) -g -O2 -Wall -fPIC -I../include
-#-DRFX_USE_ACCEL
-
-LDFLAGS =
-
-LIBS =
-
-all: librfxencode.so
-
-librfxencode.so: $(OBJS) Makefile
- $(CC) -shared -o librfxencode.so $(LDFLAGS) $(OBJS) $(LIBS)
- $(AR) -rv librfxencode.a $(OBJS)
-
-cpuid_x86.o: x86/cpuid_x86.asm
- yasm -f elf32 -g dwarf2 x86/cpuid_x86.asm
-
-rfxrlgr1_x86.o: x86/rfxrlgr1_x86.asm
- yasm -f elf32 -g dwarf2 x86/rfxrlgr1_x86.asm
-
-rfxrlgr3_x86.o: x86/rfxrlgr3_x86.asm
- yasm -f elf32 -g dwarf2 x86/rfxrlgr3_x86.asm
-
-rfxdwt_x86_sse2.o: x86/rfxdwt_x86_sse2.asm
- yasm -f elf32 -g dwarf2 x86/rfxdwt_x86_sse2.asm
-
-cpuid_amd64.o: amd64/cpuid_amd64.asm
- yasm -f elf64 -g dwarf2 amd64/cpuid_amd64.asm
-
-rfxrlgr1_amd64.o: amd64/rfxrlgr1_amd64.asm
- yasm -f elf64 -g dwarf2 amd64/rfxrlgr1_amd64.asm
-
-rfxrlgr3_amd64.o: amd64/rfxrlgr3_amd64.asm
- yasm -f elf64 -g dwarf2 amd64/rfxrlgr3_amd64.asm
-
-rfxdwt_amd64_sse2.o: amd64/rfxdwt_amd64_sse2.asm
- yasm -f elf64 -g dwarf2 amd64/rfxdwt_amd64_sse2.asm
-
-clean:
- rm -f $(OBJS) librfxencode.so librfxencode.a
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
index 0000000..54eb6fe
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,57 @@
+EXTRA_DIST = $(AMD64_ASM) $(X86_ASM) nasm_lt.sh
+
+AMD64_ASM = \
+ amd64/cpuid_amd64.asm \
+ amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm \
+ amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm \
+ amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm \
+ amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm
+
+X86_ASM = \
+ x86/cpuid_x86.asm \
+ x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm \
+ x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm \
+ x86/rfxcodec_encode_dwt_shift_x86_sse2.asm \
+ x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
+
+ASM_SOURCES =
+
+AM_CPPFLAGS = \
+ -I$(top_srcdir)/include \
+ -I../include
+
+if WITH_SIMD_AMD64
+ASM_SOURCES += $(AMD64_ASM)
+AM_CPPFLAGS += -DSIMD_USE_ACCEL=1 -DRFX_USE_ACCEL_AMD64=1
+endif
+
+if WITH_SIMD_X86
+ASM_SOURCES += $(X86_ASM)
+AM_CPPFLAGS += -DSIMD_USE_ACCEL=1 -DRFX_USE_ACCEL_X86=1
+endif
+
+noinst_HEADERS = \
+ rfx_bitstream.h \
+ rfxcommon.h \
+ rfxcompose.h \
+ rfxconstants.h \
+ rfxencode_alpha.h \
+ rfxencode_differential.h \
+ rfxencode_dwt.h \
+ rfxencode.h \
+ rfxencode_quantization.h \
+ rfxencode_rlgr1.h \
+ rfxencode_rlgr3.h \
+ rfxencode_tile.h \
+ amd64/funcs_amd64.h \
+ x86/funcs_x86.h
+
+lib_LTLIBRARIES = librfxencode.la
+
+librfxencode_la_SOURCES = $(noinst_HEADERS) rfxencode.c \
+ rfxcompose.c rfxencode_tile.c rfxencode_dwt.c \
+ rfxencode_quantization.c rfxencode_differential.c \
+ rfxencode_rlgr1.c rfxencode_rlgr3.c rfxencode_alpha.c $(ASM_SOURCES)
+
+.asm.lo:
+ $(LIBTOOL) --mode=compile $(srcdir)/nasm_lt.sh $(NASM) $(NAFLAGS) -I$(srcdir) -I. $< -o $@
diff --git a/src/amd64/cpuid_amd64.asm b/src/amd64/cpuid_amd64.asm
index b97937a..e561b2d 100644
--- a/src/amd64/cpuid_amd64.asm
+++ b/src/amd64/cpuid_amd64.asm
@@ -1,3 +1,6 @@
+%ifidn __OUTPUT_FORMAT__,elf64
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
SECTION .text
@@ -13,10 +16,14 @@ SECTION .text
;int
;cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx)
+%ifidn __OUTPUT_FORMAT__,elf64
PROC cpuid_amd64
+%else
+PROC _cpuid_amd64
+%endif
; save registers
push rbx
-
+
push rdx
push rcx
push r8
@@ -33,9 +40,9 @@ PROC cpuid_amd64
mov [rdi], ebx
pop rdi
mov [rdi], eax
- mov eax, 0
+ mov rax, 0
; restore registers
pop rbx
- ret;
+ ret
align 16
diff --git a/src/amd64/funcs_amd64.h b/src/amd64/funcs_amd64.h
index 02cf6c8..124f838 100644
--- a/src/amd64/funcs_amd64.h
+++ b/src/amd64/funcs_amd64.h
@@ -1,5 +1,5 @@
/*
-Copyright 2014 Jay Sorg
+Copyright 2014-2015 Jay Sorg
Permission to use, copy, modify, distribute, and sell this software and its
documentation for any purpose is hereby granted without fee, provided that
@@ -24,12 +24,48 @@ amd64 asm files
#ifndef __FUNCS_AMD64_H
#define __FUNCS_AMD64_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
int
cpuid_amd64(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx);
+
+int
+rfxcodec_encode_dwt_shift_amd64_sse2(const char *qtable,
+ unsigned char *data,
+ short *dwt_buffer1,
+ short *dwt_buffer);
+int
+rfxcodec_encode_dwt_shift_amd64_sse41(const char *qtable,
+ unsigned char *data,
+ short *dwt_buffer1,
+ short *dwt_buffer);
+int
+rfxcodec_encode_diff_rlgr1_amd64_sse2(short *co,
+ void *dst, int dst_bytes);
int
-dwt_shift_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
+rfxcodec_encode_diff_rlgr3_amd64_sse2(short *co,
+ void *dst, int dst_bytes);
+
+int
+rfxcodec_decode_rlgr1_diff_amd64_sse2(void *data, int data_bytes,
+ short *out_data);
+int
+rfxcodec_decode_rlgr3_diff_amd64_sse2(void *data, int data_bytes,
+ short *out_data);
int
-diff_rlgr3_amd64(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
+rfxcodec_decode_shift_idwt_amd64_sse2(char *qtable, short *src, short *dst);
+int
+rfxcodec_decode_yuv2rgb_amd64_sse2(short *ydata, short *udata, short *vdata,
+ unsigned int *rgbdata, int stride);
+int
+rfxcodec_decode_yuva2argb_amd64_sse2(short *ydata, short *udata,
+ short *vdata, char *adata,
+ unsigned int *rgbdata, int stride);
+#ifdef __cplusplus
+}
#endif
+#endif
diff --git a/src/amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm b/src/amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm
new file mode 100644
index 0000000..b2de84f
--- /dev/null
+++ b/src/amd64/rfxcodec_encode_diff_rlgr1_amd64_sse2.asm
@@ -0,0 +1,36 @@
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+ const1 times 8 dw 1
+
+section .text
+
+%macro PROC 1
+ align 16
+ global %1
+ %1:
+%endmacro
+
+;The first six integer or pointer arguments are passed in registers
+;RDI, RSI, RDX, RCX, R8, and R9
+
+;int
+;rfxcodec_encode_diff_rlgr1_amd64_sse2(short *co,
+; void *dst, int dst_bytes);
+
+%ifidn __OUTPUT_FORMAT__,elf64
+PROC rfxcodec_encode_diff_rlgr1_amd64_sse2
+%else
+PROC _rfxcodec_encode_diff_rlgr1_amd64_sse2
+%endif
+ ; save registers
+ push rbx
+
+ mov rax, 0
+ ; restore registers
+ pop rbx
+ ret
+ align 16
+
diff --git a/src/amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm b/src/amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm
new file mode 100644
index 0000000..f5712be
--- /dev/null
+++ b/src/amd64/rfxcodec_encode_diff_rlgr3_amd64_sse2.asm
@@ -0,0 +1,31 @@
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+ const1 times 8 dw 1 ; eight words of 1; not referenced by the code in this file
+
+section .text
+; PROC name: align to 16 bytes, export the symbol and define its label
+%macro PROC 1
+ align 16
+ global %1
+ %1:
+%endmacro
+
+;int
+;rfxcodec_encode_diff_rlgr3_amd64_sse2(short *co,
+; void *dst, int dst_bytes);
+;placeholder body: none of the arguments is read and rax is always 0
+%ifidn __OUTPUT_FORMAT__,elf64
+PROC rfxcodec_encode_diff_rlgr3_amd64_sse2
+%else
+PROC _rfxcodec_encode_diff_rlgr3_amd64_sse2 ; other output formats get a leading underscore
+%endif
+ ; save registers (rbx is not otherwise touched here)
+ push rbx
+ mov rax, 0 ; return value 0
+ pop rbx ; restore registers
+ ret
+ align 16 ; pad to the next 16 byte boundary
+
diff --git a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm
new file mode 100644
index 0000000..ee97588
--- /dev/null
+++ b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm
@@ -0,0 +1,1503 @@
+;
+;Copyright 2016 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;amd64 asm dwt
+
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+ align 16
+ cw128 times 8 dw 128 ; 128 in each word; subtracted from the unpacked 8 bit samples
+ cdFFFF times 4 dd 65535 ; 0x0000FFFF per dword; keeps the low word of every dword
+ ; rounding table indexed by shift factor 0 to 15, 16 bytes per entry: 1 << (factor - 1), entry 0 is 0
+ cwa0 times 8 dw 0 ; 0
+ cwa1 times 8 dw 1 ; 1
+ cwa2 times 8 dw 2 ; 2
+ cwa4 times 8 dw 4 ; 3
+ cwa8 times 8 dw 8 ; 4
+ cwa16 times 8 dw 16 ; 5
+ cwa32 times 8 dw 32 ; 6
+ cwa64 times 8 dw 64 ; 7
+ cwa128 times 8 dw 128 ; 8
+ cwa256 times 8 dw 256 ; 9
+ cwa512 times 8 dw 512 ; 10
+ cwa1024 times 8 dw 1024 ; 11
+ cwa2048 times 8 dw 2048 ; 12
+ cwa4096 times 8 dw 4096 ; 13
+ cwa8192 times 8 dw 8192 ; 14
+ cwa16384 times 8 dw 16384 ; 15
+
+section .text
+; PROC name: align to 16 bytes, export the symbol and define its label
+%macro PROC 1
+ align 16
+ global %1
+ %1:
+%endmacro
+
+;******************************************************************************
+; horizontal pass; source 16 bit signed, 16 pixel width; rsi = src, rdi = dst hi, rdx = dst lo, xmm8-xmm11 = rounding / shift
+rfx_dwt_2d_encode_block_horiz_16_16:
+ mov ecx, 8 ; 8 rows
+loop1a:
+ ; pre / post: the single 16 word chunk is both the start and the end of the row
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; top dword of the upper half
+ por xmm4, xmm5 ; row end: the last even word doubles as src[2n + 2]
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), computed from the unshifted h in xmm5
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ and eax, 0xFFFF
+ movd xmm6, eax
+ por xmm7, xmm6 ; slot 0: the row start uses h[0] as h[n - 1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10 ; add the lo rounding words
+ psraw xmm6, xmm11 ; shift by the lo count
+ movdqa [rdx], xmm6
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; move left (undoes the step above; the row is a single chunk)
+ lea rsi, [rsi - 16 * 2]
+ lea rdi, [rdi - 8 * 2]
+ lea rdx, [rdx - 8 * 2]
+
+ ; move down (next row: 16 source words, 8 words per output band)
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ dec ecx
+ jnz loop1a
+
+ ret ; eax, ecx, rsi, rdi, rdx and xmm1-xmm7 have been modified
+
+;******************************************************************************
+; vertical pass; source 16 bit signed, 16 pixel width; rsi = src, rdi = dst hi, rdx = dst lo; no rounding / shift applied
+rfx_dwt_2d_encode_block_verti_16_16:
+ mov ecx, 2 ; 2 column strips of 8 words
+loop1b:
+ ; pre: first output row of the strip
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1]
+ movdqa xmm3, [rsi + 16 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1); no h[n - 1] yet, so this row is src[2n] + h[n]
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 16 * 2 * 2] ; 2 rows
+ lea rdi, [rdi + 16 * 2] ; 1 row
+ lea rdx, [rdx + 16 * 2] ; 1 row
+
+ ; loop: strip counter parked in the upper half of ecx, cx counts 6 middle rows
+ shl ecx, 16
+ mov cx, 6
+loop2b:
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1]
+ movdqa xmm3, [rsi + 16 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), xmm7 holds h[n - 1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 16 * 2 * 2] ; 2 rows
+ lea rdi, [rdi + 16 * 2] ; 1 row
+ lea rdx, [rdx + 16 * 2] ; 1 row
+
+ dec cx
+ jnz loop2b
+ shr ecx, 16 ; strip counter back in the low half
+
+ ; post: last output row of the strip
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; no row below: src[2n] stands in for src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ ; move down
+ lea rsi, [rsi + 16 * 2 * 2] ; 2 row
+ lea rdi, [rdi + 16 * 2] ; 1 row
+ lea rdx, [rdx + 16 * 2] ; 1 row
+
+ ; move up (16 source rows, 8 rows in each output band)
+ lea rsi, [rsi - 16 * 16 * 2]
+ lea rdi, [rdi - 8 * 16 * 2]
+ lea rdx, [rdx - 8 * 16 * 2]
+
+ ; move right (next strip of 8 words)
+ lea rsi, [rsi + 16]
+ lea rdi, [rdi + 16]
+ lea rdx, [rdx + 16]
+
+ dec ecx
+ jnz loop1b
+
+ ret ; ecx, rsi, rdi, rdx and xmm1-xmm7 have been modified
+
+;******************************************************************************
+; horizontal pass; source 16 bit signed, 32 pixel width; rsi = src, rdi = dst hi, rdx = dst lo, xmm8-xmm11 = rounding / shift
+rfx_dwt_2d_encode_block_horiz_16_32:
+ mov ecx, 16 ; 16 rows
+loop1c:
+ ; pre: first 16 word chunk of the row
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ mov eax, [rsi + 32] ; first dword of the next 16 word chunk
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5 ; it supplies src[2n + 2] for the last slot
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), computed from the unshifted h in xmm5
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ and eax, 0xFFFF
+ movd xmm6, eax
+ por xmm7, xmm6 ; slot 0: the row start uses h[0] as h[n - 1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14
+ movd ebx, xmm2 ; save hi: last h word of this chunk, carried in ebx
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10 ; add the lo rounding words
+ psraw xmm6, xmm11 ; shift by the lo count
+ movdqa [rdx], xmm6
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; post: last 16 word chunk of the row
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; top dword of the upper half
+ por xmm4, xmm5 ; row end: the last even word doubles as src[2n + 2]
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ movd xmm6, ebx
+ por xmm7, xmm6 ; slot 0: h[n - 1] is the last h of the previous chunk
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10 ; add the lo rounding words
+ psraw xmm6, xmm11 ; shift by the lo count
+ movdqa [rdx], xmm6
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; move left (back to the start of the row)
+ lea rsi, [rsi - 32 * 2]
+ lea rdi, [rdi - 16 * 2]
+ lea rdx, [rdx - 16 * 2]
+
+ ; move down (next row: 32 source words, 16 words per output band)
+ lea rsi, [rsi + 32 * 2]
+ lea rdi, [rdi + 16 * 2]
+ lea rdx, [rdx + 16 * 2]
+
+ dec ecx
+ jnz loop1c
+
+ ret ; eax, ebx, ecx, rsi, rdi, rdx and xmm1-xmm7 have been modified
+
+;******************************************************************************
+; horizontal pass, lo band stored without rounding / shift; source 16 bit signed, 32 pixel width; rsi = src, rdi = dst hi, rdx = dst lo, xmm8 / xmm9 = hi rounding / shift
+rfx_dwt_2d_encode_block_horiz_16_32_no_lo:
+ mov ecx, 16 ; 16 rows
+loop1c1:
+ ; pre: first 16 word chunk of the row
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ mov eax, [rsi + 32] ; first dword of the next 16 word chunk
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5 ; it supplies src[2n + 2] for the last slot
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), computed from the unshifted h in xmm5
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ and eax, 0xFFFF
+ movd xmm6, eax
+ por xmm7, xmm6 ; slot 0: the row start uses h[0] as h[n - 1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14
+ movd ebx, xmm2 ; save hi: last h word of this chunk, carried in ebx
+
+ movdqa [rdx], xmm5 ; out lo, stored without rounding / shift
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; post: last 16 word chunk of the row
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; top dword of the upper half
+ por xmm4, xmm5 ; row end: the last even word doubles as src[2n + 2]
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ movd xmm6, ebx
+ por xmm7, xmm6 ; slot 0: h[n - 1] is the last h of the previous chunk
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa [rdx], xmm5 ; out lo, stored without rounding / shift
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; move left (back to the start of the row)
+ lea rsi, [rsi - 32 * 2]
+ lea rdi, [rdi - 16 * 2]
+ lea rdx, [rdx - 16 * 2]
+
+ ; move down (next row: 32 source words, 16 words per output band)
+ lea rsi, [rsi + 32 * 2]
+ lea rdi, [rdi + 16 * 2]
+ lea rdx, [rdx + 16 * 2]
+
+ dec ecx
+ jnz loop1c1
+
+ ret ; eax, ebx, ecx, rsi, rdi, rdx and xmm1-xmm7 have been modified
+
+;******************************************************************************
+; vertical pass; source 16 bit signed, 32 pixel width; rsi = src, rdi = dst hi, rdx = dst lo; no rounding / shift applied
+rfx_dwt_2d_encode_block_verti_16_32:
+ mov ecx, 4 ; 4 column strips of 8 words
+loop1d:
+ ; pre: first output row of the strip
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1]
+ movdqa xmm3, [rsi + 32 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1); no h[n - 1] yet, so this row is src[2n] + h[n]
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 32 * 2 * 2] ; 2 rows
+ lea rdi, [rdi + 32 * 2] ; 1 row
+ lea rdx, [rdx + 32 * 2] ; 1 row
+
+ ; loop: strip counter parked in the upper half of ecx, cx counts 14 middle rows
+ shl ecx, 16
+ mov cx, 14
+loop2d:
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1]
+ movdqa xmm3, [rsi + 32 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), xmm7 holds h[n - 1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 32 * 2 * 2] ; 2 rows
+ lea rdi, [rdi + 32 * 2] ; 1 row
+ lea rdx, [rdx + 32 * 2] ; 1 row
+
+ dec cx
+ jnz loop2d
+ shr ecx, 16 ; strip counter back in the low half
+
+ ; post: last output row of the strip
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; no row below: src[2n] stands in for src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ ; move down
+ lea rsi, [rsi + 32 * 2 * 2] ; 2 row
+ lea rdi, [rdi + 32 * 2] ; 1 row
+ lea rdx, [rdx + 32 * 2] ; 1 row
+
+ ; move up (32 source rows, 16 rows in each output band)
+ lea rsi, [rsi - 32 * 32 * 2]
+ lea rdi, [rdi - 16 * 32 * 2]
+ lea rdx, [rdx - 16 * 32 * 2]
+
+ ; move right (next strip of 8 words)
+ lea rsi, [rsi + 16]
+ lea rdi, [rdi + 16]
+ lea rdx, [rdx + 16]
+
+ dec ecx
+ jnz loop1d
+
+ ret ; ecx, rsi, rdi, rdx and xmm1-xmm7 have been modified
+
+;******************************************************************************
+; horizontal pass; source 16 bit signed, 64 pixel width; rsi = src, rdi = dst hi, rdx = dst lo, xmm8-xmm11 = rounding / shift
+rfx_dwt_2d_encode_block_horiz_16_64:
+ mov ecx, 32 ; 32 rows
+loop1e:
+ ; pre: first 16 word chunk of the row
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ mov eax, [rsi + 32] ; first dword of the next 16 word chunk
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5 ; it supplies src[2n + 2] for the last slot
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), computed from the unshifted h in xmm5
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ and eax, 0xFFFF
+ movd xmm6, eax
+ por xmm7, xmm6 ; slot 0: the row start uses h[0] as h[n - 1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14
+ movd ebx, xmm2 ; save hi: last h word of this chunk, carried in ebx
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10 ; add the lo rounding words
+ psraw xmm6, xmm11 ; shift by the lo count
+ movdqa [rdx], xmm6
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; loop: row counter parked in the upper half of ecx, cx counts 2 middle chunks
+ shl ecx, 16
+ mov cx, 2
+loop2e:
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ mov eax, [rsi + 32] ; first dword of the next 16 word chunk
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5 ; it supplies src[2n + 2] for the last slot
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ movd xmm6, ebx
+ por xmm7, xmm6 ; slot 0: h[n - 1] is the last h of the previous chunk
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14
+ movd ebx, xmm2 ; save hi: last h word of this chunk, carried in ebx
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10 ; add the lo rounding words
+ psraw xmm6, xmm11 ; shift by the lo count
+ movdqa [rdx], xmm6
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ dec cx
+ jnz loop2e
+ shr ecx, 16 ; row counter back in the low half
+
+ ; post: last 16 word chunk of the row
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; top dword of the upper half
+ por xmm4, xmm5 ; row end: the last even word doubles as src[2n + 2]
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ movd xmm6, ebx
+ por xmm7, xmm6 ; slot 0: h[n - 1] is the last h of the previous chunk
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10 ; add the lo rounding words
+ psraw xmm6, xmm11 ; shift by the lo count
+ movdqa [rdx], xmm6
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; move left (back to the start of the row)
+ lea rsi, [rsi - 64 * 2]
+ lea rdi, [rdi - 32 * 2]
+ lea rdx, [rdx - 32 * 2]
+
+ ; move down (next row: 64 source words, 32 words per output band)
+ lea rsi, [rsi + 64 * 2]
+ lea rdi, [rdi + 32 * 2]
+ lea rdx, [rdx + 32 * 2]
+
+ dec ecx
+ jnz loop1e
+
+ ret ; eax, ebx, ecx, rsi, rdi, rdx and xmm1-xmm7 have been modified
+
+;******************************************************************************
+; horizontal pass, lo band stored without rounding / shift; source 16 bit signed, 64 pixel width; rsi = src, rdi = dst hi, rdx = dst lo, xmm8 / xmm9 = hi rounding / shift
+rfx_dwt_2d_encode_block_horiz_16_64_no_lo:
+ mov ecx, 32 ; 32 rows
+loop1e1:
+ ; pre: first 16 word chunk of the row
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ mov eax, [rsi + 32] ; first dword of the next 16 word chunk
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5 ; it supplies src[2n + 2] for the last slot
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), computed from the unshifted h in xmm5
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ and eax, 0xFFFF
+ movd xmm6, eax
+ por xmm7, xmm6 ; slot 0: the row start uses h[0] as h[n - 1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14
+ movd ebx, xmm2 ; save hi: last h word of this chunk, carried in ebx
+
+ movdqa [rdx], xmm5 ; out lo, stored without rounding / shift
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; loop: row counter parked in the upper half of ecx, cx counts 2 middle chunks
+ shl ecx, 16
+ mov cx, 2
+loop2e1:
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ mov eax, [rsi + 32] ; first dword of the next 16 word chunk
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5 ; it supplies src[2n + 2] for the last slot
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ movd xmm6, ebx
+ por xmm7, xmm6 ; slot 0: h[n - 1] is the last h of the previous chunk
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14
+ movd ebx, xmm2 ; save hi: last h word of this chunk, carried in ebx
+
+ movdqa [rdx], xmm5 ; out lo, stored without rounding / shift
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ dec cx
+ jnz loop2e1
+ shr ecx, 16 ; row counter back in the low half
+
+ ; post: last 16 word chunk of the row
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; raw copies kept in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = 8 even words, src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = 8 odd words, src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; top slot of the lower half comes from the upper half
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; top dword of the upper half
+ por xmm4, xmm5 ; row end: the last even word doubles as src[2n + 2]
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = 8 words, src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add the hi rounding words
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2 ; h[n - 1] for slots 1 to 7
+ movd xmm6, ebx
+ por xmm7, xmm6 ; slot 0: h[n - 1] is the last h of the previous chunk
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa [rdx], xmm5 ; out lo, stored without rounding / shift
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; move left (back to the start of the row)
+ lea rsi, [rsi - 64 * 2]
+ lea rdi, [rdi - 32 * 2]
+ lea rdx, [rdx - 32 * 2]
+
+ ; move down (next row: 64 source words, 32 words per output band)
+ lea rsi, [rsi + 64 * 2]
+ lea rdi, [rdi + 32 * 2]
+ lea rdx, [rdx + 32 * 2]
+
+ dec ecx
+ jnz loop1e1
+
+ ret ; eax, ebx, ecx, rsi, rdi, rdx and xmm1-xmm7 have been modified
+
+;******************************************************************************
+; vertical pass; source 8 bit unsigned, 64 pixel width; rsi = src, rdi = dst hi, rdx = dst lo (16 bit); xmm0 must be zero on entry
+rfx_dwt_2d_encode_block_verti_8_64:
+ mov ecx, 8 ; 8 column strips of 8 pixels
+loop1f:
+ ; pre: first output row of the strip
+ movq xmm1, [rsi] ; src[2n]
+ movq xmm2, [rsi + 64 * 1] ; src[2n + 1]
+ movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2]
+ punpcklbw xmm1, xmm0 ; bytes widened to words (xmm0 supplies the zero high bytes)
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ psubw xmm1, [rel cw128] ; sample - 128
+ psubw xmm2, [rel cw128]
+ psubw xmm3, [rel cw128]
+ psllw xmm1, 5 ; then scaled by 32
+ psllw xmm2, 5
+ psllw xmm3, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1); no h[n - 1] yet, so this row is src[2n] + h[n]
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 64 * 1 * 2] ; 2 rows
+ lea rdi, [rdi + 64 * 2] ; 1 row
+ lea rdx, [rdx + 64 * 2] ; 1 row
+
+ ; loop: strip counter parked in the upper half of ecx, cx counts 30 middle rows
+ shl ecx, 16
+ mov cx, 30
+loop2f:
+ movdqa xmm1, xmm3 ; src[2n]
+ movq xmm2, [rsi + 64 * 1] ; src[2n + 1]
+ movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2]
+ punpcklbw xmm2, xmm0 ; bytes widened to words
+ punpcklbw xmm3, xmm0
+ psubw xmm2, [rel cw128] ; sample - 128
+ psubw xmm3, [rel cw128]
+ psllw xmm2, 5 ; then scaled by 32
+ psllw xmm3, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1), xmm7 holds h[n - 1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 64 * 1 * 2] ; 2 rows
+ lea rdi, [rdi + 64 * 2] ; 1 row
+ lea rdx, [rdx + 64 * 2] ; 1 row
+
+ dec cx
+ jnz loop2f
+ shr ecx, 16 ; strip counter back in the low half
+
+ ; post: last output row of the strip
+ movdqa xmm1, xmm3 ; src[2n]
+ movq xmm2, [rsi + 64 * 1] ; src[2n + 1]
+ punpcklbw xmm2, xmm0 ; bytes widened to words
+ psubw xmm2, [rel cw128] ; sample - 128
+ psllw xmm2, 5 ; then scaled by 32
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; no row below: src[2n] stands in for src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ ; move down
+ lea rsi, [rsi + 64 * 1 * 2] ; 2 rows
+ lea rdi, [rdi + 64 * 2] ; 1 row
+ lea rdx, [rdx + 64 * 2] ; 1 row
+
+ ; move up (64 source rows of 64 bytes, 32 rows of 64 words in each output band)
+ lea rsi, [rsi - 64 * 1 * 64]
+ lea rdi, [rdi - 32 * 64 * 2]
+ lea rdx, [rdx - 32 * 64 * 2]
+
+ ; move right (next strip: 8 source bytes, 8 output words)
+ lea rsi, [rsi + 8]
+ lea rdi, [rdi + 16]
+ lea rdx, [rdx + 16]
+
+ dec ecx
+ jnz loop1f
+
+ ret ; ecx, rsi, rdi, rdx and xmm1-xmm7 have been modified
+
+set_quants_hi: ; in: rax = quant value; out: xmm9 = shift count, xmm8 = rounding words; modifies rax, rdx
+ sub rax, 6 - 5 ; shift count = quant value - 1
+ movd xmm9, eax ; count later used by psraw on the hi band
+ imul rax, 16 ; 16 bytes per table entry
+ lea rdx, [rel cwa0]
+ add rdx, rax
+ movdqa xmm8, [rdx] ; table entry for that count: 1 << (count - 1), or 0 for count 0
+ ret
+
+set_quants_lo: ; in: rax = quant value; out: xmm11 = shift count, xmm10 = rounding words; modifies rax, rdx
+ sub rax, 6 - 5 ; shift count = quant value - 1
+ movd xmm11, eax ; count later used by psraw on the lo band
+ imul rax, 16 ; 16 bytes per table entry
+ lea rdx, [rel cwa0]
+ add rdx, rax
+ movdqa xmm10, [rdx] ; table entry for that count: 1 << (count - 1), or 0 for count 0
+ ret
+
+;The first six integer or pointer arguments are passed in registers
+;RDI, RSI, RDX, RCX, R8, and R9
+;here: rdi = qtable, rsi = in_buffer, rdx = out_buffer, rcx = work_buffer; rax is 0 on return
+;int
+;rfxcodec_encode_dwt_shift_amd64_sse2(const char *qtable,
+; unsigned char *in_buffer,
+; short *out_buffer,
+; short *work_buffer);
+;three DWT levels, each a vertical pass into work_buffer then horizontal passes into out_buffer
+;******************************************************************************
+%ifidn __OUTPUT_FORMAT__,elf64
+PROC rfxcodec_encode_dwt_shift_amd64_sse2
+%else
+PROC _rfxcodec_encode_dwt_shift_amd64_sse2
+%endif
+ ; save registers; afterwards [rsp] = qtable, [rsp + 8] = in_buffer, [rsp + 16] = work_buffer, [rsp + 24] = out_buffer
+ push rbx
+ push rdx
+ push rcx
+ push rsi
+ push rdi
+ pxor xmm0, xmm0 ; zero register used by the byte to word unpack in verti_8_64
+
+ ; vertical DWT to work buffer, level 1
+ mov rsi, [rsp + 8] ; src
+ mov rdi, [rsp + 16] ; dst hi
+ lea rdi, [rdi + 64 * 32 * 2] ; dst hi
+ mov rdx, [rsp + 16] ; dst lo
+ call rfx_dwt_2d_encode_block_verti_8_64
+
+ ; horizontal DWT to out buffer, level 1, part 1
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 4]
+ and al, 0xF ; low nibble of qtable[4]
+ call set_quants_hi
+ mov rsi, [rsp + 16] ; src
+ mov rdi, [rsp + 24] ; dst hi - HL1
+ mov rdx, [rsp + 24] ; dst lo - LL1
+ lea rdx, [rdx + 32 * 32 * 6] ; dst lo - LL1
+ call rfx_dwt_2d_encode_block_horiz_16_64_no_lo
+
+ ; horizontal DWT to out buffer, level 1, part 2
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 4]
+ shr al, 4 ; high nibble of qtable[4]
+ call set_quants_hi
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 3]
+ shr al, 4 ; high nibble of qtable[3]
+ call set_quants_lo
+ mov rsi, [rsp + 16] ; src
+ lea rsi, [rsi + 64 * 32 * 2] ; src
+ mov rdi, [rsp + 24] ; dst hi - HH1
+ lea rdi, [rdi + 32 * 32 * 4] ; dst hi - HH1
+ mov rdx, [rsp + 24] ; dst lo - LH1
+ lea rdx, [rdx + 32 * 32 * 2] ; dst lo - LH1
+ call rfx_dwt_2d_encode_block_horiz_16_64
+
+ ; vertical DWT to work buffer, level 2
+ mov rsi, [rsp + 24] ; src
+ lea rsi, [rsi + 32 * 32 * 6] ; src
+ mov rdi, [rsp + 16] ; dst hi
+ lea rdi, [rdi + 32 * 16 * 2] ; dst hi
+ mov rdx, [rsp + 16] ; dst lo
+ call rfx_dwt_2d_encode_block_verti_16_32
+
+ ; horizontal DWT to out buffer, level 2, part 1
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 2]
+ shr al, 4 ; high nibble of qtable[2]
+ call set_quants_hi
+ mov rsi, [rsp + 16] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 0 = 6144
+ mov rdi, [rsp + 24] ; dst hi - HL2
+ lea rdi, [rdi + 6144] ; dst hi - HL2
+ ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+ mov rdx, [rsp + 24] ; dst lo - LL2
+ lea rdx, [rdx + 7680] ; dst lo - LL2
+ call rfx_dwt_2d_encode_block_horiz_16_32_no_lo
+
+ ; horizontal DWT to out buffer, level 2, part 2
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 3]
+ and al, 0xF ; low nibble of qtable[3]
+ call set_quants_hi
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 2]
+ and al, 0xF ; low nibble of qtable[2]
+ call set_quants_lo
+ mov rsi, [rsp + 16] ; src
+ lea rsi, [rsi + 32 * 16 * 2] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 4 = 7168
+ mov rdi, [rsp + 24] ; dst hi - HH2
+ lea rdi, [rdi + 7168] ; dst hi - HH2
+ ; 32 * 32 * 6 + 16 * 16 * 2 = 6656
+ mov rdx, [rsp + 24] ; dst lo - LH2
+ lea rdx, [rdx + 6656] ; dst lo - LH2
+ call rfx_dwt_2d_encode_block_horiz_16_32
+
+ ; vertical DWT to work buffer, level 3
+ ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+ mov rsi, [rsp + 24] ; src
+ lea rsi, [rsi + 7680] ; src
+ mov rdi, [rsp + 16] ; dst hi
+ lea rdi, [rdi + 16 * 8 * 2] ; dst hi
+ mov rdx, [rsp + 16] ; dst lo
+ call rfx_dwt_2d_encode_block_verti_16_16
+
+ ; horizontal DWT to out buffer, level 3, part 1
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 1]
+ and al, 0xF ; low nibble of qtable[1]
+ call set_quants_hi
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 0]
+ and al, 0xF ; low nibble of qtable[0]
+ call set_quants_lo
+ mov rsi, [rsp + 16] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680
+ mov rdi, [rsp + 24] ; dst hi - HL3
+ lea rdi, [rdi + 7680] ; dst hi - HL3
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064
+ mov rdx, [rsp + 24] ; dst lo - LL3
+ lea rdx, [rdx + 8064] ; dst lo - LL3
+ call rfx_dwt_2d_encode_block_horiz_16_16
+
+ ; horizontal DWT to out buffer, level 3, part 2
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 1]
+ shr al, 4 ; high nibble of qtable[1]
+ call set_quants_hi
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 0]
+ shr al, 4 ; high nibble of qtable[0]
+ call set_quants_lo
+ mov rsi, [rsp + 16] ; src
+ lea rsi, [rsi + 16 * 8 * 2] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936
+ mov rdi, [rsp + 24] ; dst hi - HH3
+ lea rdi, [rdi + 7936] ; dst hi - HH3
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808
+ mov rdx, [rsp + 24] ; dst lo - LH3
+ lea rdx, [rdx + 7808] ; dst lo - LH3
+ call rfx_dwt_2d_encode_block_horiz_16_16
+
+ mov rax, 0 ; return value 0
+ ; restore registers
+ pop rdi
+ pop rsi
+ pop rcx
+ pop rdx
+ pop rbx
+ ret
+ align 16 ; pad to the next 16 byte boundary
+
diff --git a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm
new file mode 100644
index 0000000..ab52808
--- /dev/null
+++ b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm
@@ -0,0 +1,1371 @@
+;
+;Copyright 2016 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;amd64 asm dwt
+
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+ align 16
+ cw128 times 8 dw 128 ; subtracted from the zero extended 8 bit samples in rfx_dwt_2d_encode_block_verti_8_64
+ cdFFFF times 4 dd 65535 ; mask that keeps the low word of every dword
+ ; rounding terms, 8 words (16 bytes) per entry: entry 0 is 0, entry n (1 to 15) is 1 << (n - 1)
+ ; set_quants_hi / set_quants_lo index this table from cwa0 in 16 byte steps
+ cwa0 times 8 dw 0 ; 0
+ cwa1 times 8 dw 1 ; 1
+ cwa2 times 8 dw 2 ; 2
+ cwa4 times 8 dw 4 ; 3
+ cwa8 times 8 dw 8 ; 4
+ cwa16 times 8 dw 16 ; 5
+ cwa32 times 8 dw 32 ; 6
+ cwa64 times 8 dw 64 ; 7
+ cwa128 times 8 dw 128 ; 8
+ cwa256 times 8 dw 256 ; 9
+ cwa512 times 8 dw 512 ; 10
+ cwa1024 times 8 dw 1024 ; 11
+ cwa2048 times 8 dw 2048 ; 12
+ cwa4096 times 8 dw 4096 ; 13
+ cwa8192 times 8 dw 8192 ; 14
+ cwa16384 times 8 dw 16384 ; 15
+
+section .text
+
+%macro PROC 1
+ align 16
+ global %1 ; export the symbol named by the macro argument
+ %1: ; define that symbol as a label at the aligned position
+%endmacro
+
+;******************************************************************************
+; source 16 bit signed, 16 pixel width
+; horizontal DWT pass over 8 rows of 16 words, with rounding and shift on both outputs
+; in: rsi = src, rdi = dst hi, rdx = dst lo (all accessed with movdqa)
+; xmm8 = value added to hi, xmm9 = arithmetic shift count for hi
+; xmm10 = value added to lo, xmm11 = arithmetic shift count for lo
+; each row writes 8 hi words to [rdi] and 8 lo words to [rdx]
+; modifies eax, ecx, xmm1 - xmm7 and advances rsi, rdi, rdx
+rfx_dwt_2d_encode_block_horiz_16_16:
+ mov ecx, 8 ; row count
+loop1a:
+ ; pre / post
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; keep the 16 unmodified words in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF] ; keep the even words
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2 ; xmm1 = 8 even words
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; odd words move into the even positions
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3 ; xmm2 = 8 odd words
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4 ; drop the first two words
+ psrldq xmm4, 4
+ movd eax, xmm7 ; first dword of the upper half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; fills the dword emptied by the shift
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; isolate the last dword (words 14 and 15)
+ por xmm4, xmm5 ; right edge: word 14 stands in for src[16]
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4 ; xmm3 = even words shifted by one position
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8 ; add rounding term
+ psraw xmm6, xmm9 ; shift by the hi count
+ movdqa [rdi], xmm6
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; each lane now holds h[n - 1], lane 0 is empty
+ and eax, 0xFFFF ; h[0]
+ movd xmm6, eax
+ por xmm7, xmm6 ; lane 0 = h[0], used in place of h[-1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10 ; add rounding term
+ psraw xmm6, xmm11 ; shift by the lo count
+ movdqa [rdx], xmm6
+
+ ; move right
+ ; (the right / left / down groups below net to rsi + 32, rdi + 16, rdx + 16)
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; move left
+ lea rsi, [rsi - 16 * 2]
+ lea rdi, [rdi - 8 * 2]
+ lea rdx, [rdx - 8 * 2]
+
+ ; move down
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ dec ecx
+ jnz loop1a
+
+ ret
+
+;******************************************************************************
+; source 16 bit signed, 16 pixel width
+; vertical DWT pass, no rounding or shift applied to the outputs
+; in: rsi = src (rows of 16 words), rdi = dst hi, rdx = dst lo (all accessed with movdqa)
+; handles 2 column groups of 8 words; each group reads 16 source rows and
+; writes 8 hi rows and 8 lo rows (pre + 6 loop iterations + post)
+; modifies ecx, xmm1 - xmm7 and rsi, rdi, rdx
+rfx_dwt_2d_encode_block_verti_16_16:
+ mov ecx, 2 ; column groups
+loop1b:
+ ; pre
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1]
+ movdqa xmm3, [rsi + 16 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm1 ; first row: h[0] is added directly, there is no h[n - 1]
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 16 * 2 * 2] ; 2 rows
+ lea rdi, [rdi + 16 * 2] ; 1 row
+ lea rdx, [rdx + 16 * 2] ; 1 row
+
+ ; loop
+ shl ecx, 16 ; keep the outer count in the upper half of ecx
+ mov cx, 6 ; inner count
+loop2b:
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1]
+ movdqa xmm3, [rsi + 16 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7 ; xmm7 = h[n - 1] from the previous row pair
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 16 * 2 * 2] ; 2 rows
+ lea rdi, [rdi + 16 * 2] ; 1 row
+ lea rdx, [rdx + 16 * 2] ; 1 row
+
+ dec cx
+ jnz loop2b
+ shr ecx, 16 ; restore the outer count
+
+ ; post
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [rsi + 16 * 2] ; src[2n + 1]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; nothing new is loaded: src[2n] is reused as src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ ; move down
+ lea rsi, [rsi + 16 * 2 * 2] ; 2 row
+ lea rdi, [rdi + 16 * 2] ; 1 row
+ lea rdx, [rdx + 16 * 2] ; 1 row
+
+ ; move up
+ lea rsi, [rsi - 16 * 16 * 2] ; back 16 source rows
+ lea rdi, [rdi - 8 * 16 * 2] ; back 8 output rows
+ lea rdx, [rdx - 8 * 16 * 2] ; back 8 output rows
+
+ ; move right
+ lea rsi, [rsi + 16] ; next 8 words
+ lea rdi, [rdi + 16]
+ lea rdx, [rdx + 16]
+
+ dec ecx
+ jnz loop1b
+
+ ret
+
+;******************************************************************************
+; source 16 bit signed, 32 pixel width
+; horizontal DWT pass over 16 rows of 32 words, with rounding and shift on both outputs
+; in: rsi = src, rdi = dst hi, rdx = dst lo (all accessed with movdqa)
+; xmm8 = value added to hi, xmm9 = arithmetic shift count for hi
+; xmm10 = value added to lo, xmm11 = arithmetic shift count for lo
+; each row is handled as two 16 word chunks: "pre" (left) and "post" (right)
+; modifies eax, ebx, ecx, xmm1 - xmm7 and advances rsi, rdi, rdx
+rfx_dwt_2d_encode_block_horiz_16_32:
+ mov ecx, 16 ; row count
+loop1c:
+ ; pre
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; keep the 16 unmodified words in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF] ; keep the even words
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2 ; xmm1 = 8 even words
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3 ; xmm2 = 8 odd words
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7 ; first dword of the upper half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; fills the dword emptied by the shift
+ mov eax, [rsi + 32] ; first dword of the next chunk supplies src[16]
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4 ; xmm3 = even words shifted by one position
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8
+ psraw xmm6, xmm9
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; each lane now holds h[n - 1], lane 0 is empty
+ and eax, 0xFFFF ; h[0]
+ movd xmm6, eax
+ por xmm7, xmm6 ; lane 0 = h[0], used in place of h[-1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; last h word of this chunk
+ movd ebx, xmm2 ; save hi
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10
+ psraw xmm6, xmm11
+ movdqa [rdx], xmm6
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; post
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; isolate the last dword of the row
+ por xmm4, xmm5 ; right edge: the last even word stands in for the missing src[2n + 2]
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8
+ psraw xmm6, xmm9
+ movdqa [rdi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; last h word of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10
+ psraw xmm6, xmm11
+ movdqa [rdx], xmm6
+
+ ; move right
+ ; (the right / left / down groups below net to rsi + 32, rdi + 16, rdx + 16)
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; move left
+ lea rsi, [rsi - 32 * 2]
+ lea rdi, [rdi - 16 * 2]
+ lea rdx, [rdx - 16 * 2]
+
+ ; move down
+ lea rsi, [rsi + 32 * 2]
+ lea rdi, [rdi + 16 * 2]
+ lea rdx, [rdx + 16 * 2]
+
+ dec ecx
+ jnz loop1c
+
+ ret
+
+;******************************************************************************
+; source 16 bit signed, 32 pixel width
+; horizontal DWT pass over 16 rows of 32 words; hi output gets rounding and shift,
+; lo output is stored as computed (xmm10 / xmm11 are not used here)
+; in: rsi = src, rdi = dst hi, rdx = dst lo (all accessed with movdqa)
+; xmm8 = value added to hi, xmm9 = arithmetic shift count for hi
+; each row is handled as two 16 word chunks: "pre" (left) and "post" (right)
+; modifies eax, ebx, ecx, xmm1 - xmm7 and advances rsi, rdi, rdx
+rfx_dwt_2d_encode_block_horiz_16_32_no_lo:
+ mov ecx, 16 ; row count
+loop1c1:
+ ; pre
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; keep the 16 unmodified words in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF] ; keep the even words
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2 ; xmm1 = 8 even words
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3 ; xmm2 = 8 odd words
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [rsi + 32] ; first dword of the next chunk supplies src[16]
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4 ; xmm3 = even words shifted by one position
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8
+ psraw xmm6, xmm9
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; each lane now holds h[n - 1], lane 0 is empty
+ and eax, 0xFFFF ; h[0]
+ movd xmm6, eax
+ por xmm7, xmm6 ; lane 0 = h[0], used in place of h[-1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; last h word of this chunk
+ movd ebx, xmm2 ; save hi
+
+ movdqa [rdx], xmm5 ; out lo
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; post
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; isolate the last dword of the row
+ por xmm4, xmm5 ; right edge: the last even word stands in for the missing src[2n + 2]
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8
+ psraw xmm6, xmm9
+ movdqa [rdi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; last h word of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa [rdx], xmm5 ; out lo
+
+ ; move right
+ ; (the right / left / down groups below net to rsi + 32, rdi + 16, rdx + 16)
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; move left
+ lea rsi, [rsi - 32 * 2]
+ lea rdi, [rdi - 16 * 2]
+ lea rdx, [rdx - 16 * 2]
+
+ ; move down
+ lea rsi, [rsi + 32 * 2]
+ lea rdi, [rdi + 16 * 2]
+ lea rdx, [rdx + 16 * 2]
+
+ dec ecx
+ jnz loop1c1
+
+ ret
+
+;******************************************************************************
+; source 16 bit signed, 32 pixel width
+; vertical DWT pass, no rounding or shift applied to the outputs
+; in: rsi = src (rows of 32 words), rdi = dst hi, rdx = dst lo (all accessed with movdqa)
+; handles 4 column groups of 8 words; each group reads 32 source rows and
+; writes 16 hi rows and 16 lo rows (pre + 14 loop iterations + post)
+; modifies ecx, xmm1 - xmm7 and rsi, rdi, rdx
+rfx_dwt_2d_encode_block_verti_16_32:
+ mov ecx, 4 ; column groups
+loop1d:
+ ; pre
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1]
+ movdqa xmm3, [rsi + 32 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm1 ; first row: h[0] is added directly, there is no h[n - 1]
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 32 * 2 * 2] ; 2 rows
+ lea rdi, [rdi + 32 * 2] ; 1 row
+ lea rdx, [rdx + 32 * 2] ; 1 row
+
+ ; loop
+ shl ecx, 16 ; keep the outer count in the upper half of ecx
+ mov cx, 14 ; inner count
+loop2d:
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1]
+ movdqa xmm3, [rsi + 32 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7 ; xmm7 = h[n - 1] from the previous row pair
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 32 * 2 * 2] ; 2 rows
+ lea rdi, [rdi + 32 * 2] ; 1 row
+ lea rdx, [rdx + 32 * 2] ; 1 row
+
+ dec cx
+ jnz loop2d
+ shr ecx, 16 ; restore the outer count
+
+ ; post
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [rsi + 32 * 2] ; src[2n + 1]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; nothing new is loaded: src[2n] is reused as src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ ; move down
+ lea rsi, [rsi + 32 * 2 * 2] ; 2 row
+ lea rdi, [rdi + 32 * 2] ; 1 row
+ lea rdx, [rdx + 32 * 2] ; 1 row
+
+ ; move up
+ lea rsi, [rsi - 32 * 32 * 2] ; back 32 source rows
+ lea rdi, [rdi - 16 * 32 * 2] ; back 16 output rows
+ lea rdx, [rdx - 16 * 32 * 2] ; back 16 output rows
+
+ ; move right
+ lea rsi, [rsi + 16] ; next 8 words
+ lea rdi, [rdi + 16]
+ lea rdx, [rdx + 16]
+
+ dec ecx
+ jnz loop1d
+
+ ret
+
+;******************************************************************************
+; source 16 bit signed, 64 pixel width
+; horizontal DWT pass over 32 rows of 64 words, with rounding and shift on both outputs
+; in: rsi = src, rdi = dst hi, rdx = dst lo (all accessed with movdqa)
+; xmm8 = value added to hi, xmm9 = arithmetic shift count for hi
+; xmm10 = value added to lo, xmm11 = arithmetic shift count for lo
+; each row is handled as four 16 word chunks: "pre", 2 loop iterations, "post"
+; modifies eax, ebx, ecx, xmm1 - xmm7 and advances rsi, rdi, rdx
+rfx_dwt_2d_encode_block_horiz_16_64:
+ mov ecx, 32 ; row count
+loop1e:
+ ; pre
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; keep the 16 unmodified words in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF] ; keep the even words
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2 ; xmm1 = 8 even words
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3 ; xmm2 = 8 odd words
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [rsi + 32] ; first dword of the next chunk supplies src[16]
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4 ; xmm3 = even words shifted by one position
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8
+ psraw xmm6, xmm9
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; each lane now holds h[n - 1], lane 0 is empty
+ and eax, 0xFFFF ; h[0]
+ movd xmm6, eax
+ por xmm7, xmm6 ; lane 0 = h[0], used in place of h[-1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; last h word of this chunk
+ movd ebx, xmm2 ; save hi
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10
+ psraw xmm6, xmm11
+ movdqa [rdx], xmm6
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; loop
+ shl ecx, 16 ; keep the row count in the upper half of ecx
+ mov cx, 2 ; two middle chunks
+loop2e:
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [rsi + 32] ; first dword of the next chunk
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8
+ psraw xmm6, xmm9
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; last h word of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; last h word of this chunk
+ movd ebx, xmm2 ; save hi
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10
+ psraw xmm6, xmm11
+ movdqa [rdx], xmm6
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ dec cx
+ jnz loop2e
+ shr ecx, 16 ; restore the row count
+
+ ; post
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; isolate the last dword of the row
+ por xmm4, xmm5 ; right edge: the last even word stands in for the missing src[2n + 2]
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8
+ psraw xmm6, xmm9
+ movdqa [rdi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; last h word of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, xmm10
+ psraw xmm6, xmm11
+ movdqa [rdx], xmm6
+
+ ; move right
+ ; (the right / left / down groups below net to rsi + 32, rdi + 16, rdx + 16)
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; move left
+ lea rsi, [rsi - 64 * 2]
+ lea rdi, [rdi - 32 * 2]
+ lea rdx, [rdx - 32 * 2]
+
+ ; move down
+ lea rsi, [rsi + 64 * 2]
+ lea rdi, [rdi + 32 * 2]
+ lea rdx, [rdx + 32 * 2]
+
+ dec ecx
+ jnz loop1e
+
+ ret
+
+;******************************************************************************
+; source 16 bit signed, 64 pixel width
+; horizontal DWT pass over 32 rows of 64 words; hi output gets rounding and shift,
+; lo output is stored as computed (xmm10 / xmm11 are not used here)
+; in: rsi = src, rdi = dst hi, rdx = dst lo (all accessed with movdqa)
+; xmm8 = value added to hi, xmm9 = arithmetic shift count for hi
+; each row is handled as four 16 word chunks: "pre", 2 loop iterations, "post"
+; modifies eax, ebx, ecx, xmm1 - xmm7 and advances rsi, rdi, rdx
+rfx_dwt_2d_encode_block_horiz_16_64_no_lo:
+ mov ecx, 32 ; row count
+loop1e1:
+ ; pre
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1 ; keep the 16 unmodified words in xmm6 / xmm7
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF] ; keep the even words
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2 ; xmm1 = 8 even words
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3 ; xmm2 = 8 odd words
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [rsi + 32] ; first dword of the next chunk supplies src[16]
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4 ; xmm3 = even words shifted by one position
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8
+ psraw xmm6, xmm9
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; each lane now holds h[n - 1], lane 0 is empty
+ and eax, 0xFFFF ; h[0]
+ movd xmm6, eax
+ por xmm7, xmm6 ; lane 0 = h[0], used in place of h[-1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; last h word of this chunk
+ movd ebx, xmm2 ; save hi
+
+ movdqa [rdx], xmm5 ; out lo
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; loop
+ shl ecx, 16 ; keep the row count in the upper half of ecx
+ mov cx, 2 ; two middle chunks
+loop2e1:
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [rsi + 32] ; first dword of the next chunk
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8
+ psraw xmm6, xmm9
+ movdqa [rdi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; last h word of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; last h word of this chunk
+ movd ebx, xmm2 ; save hi
+
+ movdqa [rdx], xmm5 ; out lo
+
+ ; move right
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ dec cx
+ jnz loop2e1
+ shr ecx, 16 ; restore the row count
+
+ ; post
+ movdqa xmm1, [rsi] ; src[2n]
+ movdqa xmm2, [rsi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [rel cdFFFF]
+ pand xmm2, [rel cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [rel cdFFFF]
+ pand xmm3, [rel cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; isolate the last dword of the row
+ por xmm4, xmm5 ; right edge: the last even word stands in for the missing src[2n + 2]
+ pand xmm3, [rel cdFFFF]
+ pand xmm4, [rel cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, xmm8
+ psraw xmm6, xmm9
+ movdqa [rdi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; last h word of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa [rdx], xmm5 ; out lo
+
+ ; move right
+ ; (the right / left / down groups below net to rsi + 32, rdi + 16, rdx + 16)
+ lea rsi, [rsi + 16 * 2]
+ lea rdi, [rdi + 8 * 2]
+ lea rdx, [rdx + 8 * 2]
+
+ ; move left
+ lea rsi, [rsi - 64 * 2]
+ lea rdi, [rdi - 32 * 2]
+ lea rdx, [rdx - 32 * 2]
+
+ ; move down
+ lea rsi, [rsi + 64 * 2]
+ lea rdi, [rdi + 32 * 2]
+ lea rdx, [rdx + 32 * 2]
+
+ dec ecx
+ jnz loop1e1
+
+ ret
+
+;******************************************************************************
+; source 8 bit unsigned, 64 pixel width
+; vertical DWT pass that also converts the input: each byte is widened to a word,
+; has 128 subtracted and is shifted left by 5 before the transform
+; in: rsi = src (rows of 64 bytes, read with movq), rdi = dst hi, rdx = dst lo
+; (dst rows are 64 words, written with movdqa)
+; xmm0 supplies the upper byte of every widened word; the entry point clears it
+; with pxor before calling
+; handles 8 column groups of 8 pixels; each group reads 64 source rows and
+; writes 32 hi rows and 32 lo rows (pre + 30 loop iterations + post)
+; modifies ecx, xmm1 - xmm7 and rsi, rdi, rdx
+rfx_dwt_2d_encode_block_verti_8_64:
+ mov ecx, 8 ; column groups
+loop1f:
+ ; pre
+ movq xmm1, [rsi] ; src[2n]
+ movq xmm2, [rsi + 64 * 1] ; src[2n + 1]
+ movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2]
+ punpcklbw xmm1, xmm0 ; bytes to words
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ psubw xmm1, [rel cw128] ; subtract 128
+ psubw xmm2, [rel cw128]
+ psubw xmm3, [rel cw128]
+ psllw xmm1, 5 ; presumably DWT_FACTOR (5) from rfxcommon.h - TODO confirm
+ psllw xmm2, 5
+ psllw xmm3, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm1 ; first row: h[0] is added directly, there is no h[n - 1]
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 64 * 1 * 2] ; 2 rows
+ lea rdi, [rdi + 64 * 2] ; 1 row
+ lea rdx, [rdx + 64 * 2] ; 1 row
+
+ ; loop
+ shl ecx, 16 ; keep the outer count in the upper half of ecx
+ mov cx, 30 ; inner count
+loop2f:
+ movdqa xmm1, xmm3 ; src[2n]
+ movq xmm2, [rsi + 64 * 1] ; src[2n + 1]
+ movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2]
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ psubw xmm2, [rel cw128]
+ psubw xmm3, [rel cw128]
+ psllw xmm2, 5
+ psllw xmm3, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7 ; xmm7 = h[n - 1] from the previous row pair
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea rsi, [rsi + 64 * 1 * 2] ; 2 rows
+ lea rdi, [rdi + 64 * 2] ; 1 row
+ lea rdx, [rdx + 64 * 2] ; 1 row
+
+ dec cx
+ jnz loop2f
+ shr ecx, 16 ; restore the outer count
+
+ ; post
+ movdqa xmm1, xmm3 ; src[2n]
+ movq xmm2, [rsi + 64 * 1] ; src[2n + 1]
+ punpcklbw xmm2, xmm0
+ psubw xmm2, [rel cw128]
+ psllw xmm2, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; nothing new is loaded: src[2n] is reused as src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [rdi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [rdx], xmm5 ; out lo
+ ; move down
+ lea rsi, [rsi + 64 * 1 * 2] ; 2 rows
+ lea rdi, [rdi + 64 * 2] ; 1 row
+ lea rdx, [rdx + 64 * 2] ; 1 row
+
+ ; move up
+ lea rsi, [rsi - 64 * 1 * 64] ; back 64 source rows
+ lea rdi, [rdi - 32 * 64 * 2] ; back 32 output rows
+ lea rdx, [rdx - 32 * 64 * 2] ; back 32 output rows
+
+ ; move right
+ lea rsi, [rsi + 8] ; next 8 source bytes
+ lea rdi, [rdi + 16] ; next 8 output words
+ lea rdx, [rdx + 16]
+
+ dec ecx
+ jnz loop1f
+
+ ret
+
+; load the rounding term and shift count that the horizontal passes apply to hi output
+; in: rax = quantization value (callers pass a zero extended 4 bit field of qtable)
+; out: xmm9 = rax - 1 (shift count), xmm8 = 16 byte table entry number rax - 1 counted from cwa0
+; modifies rax, rdx
+; NOTE(review): no range check; rax = 0 on entry would index 16 bytes before cwa0 - confirm callers never pass it
+set_quants_hi:
+ sub rax, 6 - 5 ; presumably 6 minus DWT_FACTOR (5) - TODO confirm
+ movd xmm9, eax ; shift count
+ imul rax, 16 ; byte offset of the table entry
+ lea rdx, [rel cwa0]
+ add rdx, rax
+ movdqa xmm8, [rdx] ; rounding term
+ ret
+
+; load the rounding term and shift count that the horizontal passes apply to lo output
+; in: rax = quantization value (callers pass a zero extended 4 bit field of qtable)
+; out: xmm11 = rax - 1 (shift count), xmm10 = 16 byte table entry number rax - 1 counted from cwa0
+; modifies rax, rdx
+; NOTE(review): no range check; rax = 0 on entry would index 16 bytes before cwa0 - confirm callers never pass it
+set_quants_lo:
+ sub rax, 6 - 5 ; presumably 6 minus DWT_FACTOR (5) - TODO confirm
+ movd xmm11, eax ; shift count
+ imul rax, 16 ; byte offset of the table entry
+ lea rdx, [rel cwa0]
+ add rdx, rax
+ movdqa xmm10, [rdx] ; rounding term
+ ret
+
+;The first six integer or pointer arguments are passed in registers
+;RDI, RSI, RDX, RCX, R8, and R9
+
+;int
+;rfxcodec_encode_dwt_shift_amd64_sse41(const char *qtable,
+; unsigned char *in_buffer,
+; short *out_buffer,
+; short *work_buffer);
+;
+;Runs three DWT levels. Each level does a vertical pass into work_buffer and
+;then two horizontal passes into out_buffer, loading the shift / rounding
+;values for each output band from 4 bit fields of qtable bytes 0 to 4.
+;The LL output of levels 1 and 2 is written without shift and is the source
+;of the next level's vertical pass.
+;Always returns 0 in rax. rbx, rdx, rcx, rsi and rdi are saved and restored.
+;After the pushes below the arguments are read back from the stack:
+; [rsp] = qtable, [rsp + 8] = in_buffer, [rsp + 16] = work_buffer,
+; [rsp + 24] = out_buffer
+
+;******************************************************************************
+%ifidn __OUTPUT_FORMAT__,elf64
+PROC rfxcodec_encode_dwt_shift_amd64_sse41
+%else
+PROC _rfxcodec_encode_dwt_shift_amd64_sse41
+%endif
+ ; save registers
+ push rbx
+ push rdx
+ push rcx
+ push rsi
+ push rdi
+ pxor xmm0, xmm0 ; zero register used by the 8 bit vertical pass
+
+ ; vertical DWT to work buffer, level 1
+ mov rsi, [rsp + 8] ; src
+ mov rdi, [rsp + 16] ; dst hi
+ lea rdi, [rdi + 64 * 32 * 2] ; dst hi
+ mov rdx, [rsp + 16] ; dst lo
+ call rfx_dwt_2d_encode_block_verti_8_64
+
+ ; horizontal DWT to out buffer, level 1, part 1
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 4]
+ and al, 0xF ; low 4 bits of qtable byte 4
+ call set_quants_hi
+ mov rsi, [rsp + 16] ; src
+ mov rdi, [rsp + 24] ; dst hi - HL1
+ mov rdx, [rsp + 24] ; dst lo - LL1
+ lea rdx, [rdx + 32 * 32 * 6] ; dst lo - LL1
+ call rfx_dwt_2d_encode_block_horiz_16_64_no_lo
+
+ ; horizontal DWT to out buffer, level 1, part 2
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 4]
+ shr al, 4 ; high 4 bits of qtable byte 4
+ call set_quants_hi
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 3]
+ shr al, 4 ; high 4 bits of qtable byte 3
+ call set_quants_lo
+ mov rsi, [rsp + 16] ; src
+ lea rsi, [rsi + 64 * 32 * 2] ; src
+ mov rdi, [rsp + 24] ; dst hi - HH1
+ lea rdi, [rdi + 32 * 32 * 4] ; dst hi - HH1
+ mov rdx, [rsp + 24] ; dst lo - LH1
+ lea rdx, [rdx + 32 * 32 * 2] ; dst lo - LH1
+ call rfx_dwt_2d_encode_block_horiz_16_64
+
+ ; vertical DWT to work buffer, level 2
+ mov rsi, [rsp + 24] ; src
+ lea rsi, [rsi + 32 * 32 * 6] ; src
+ mov rdi, [rsp + 16] ; dst hi
+ lea rdi, [rdi + 32 * 16 * 2] ; dst hi
+ mov rdx, [rsp + 16] ; dst lo
+ call rfx_dwt_2d_encode_block_verti_16_32
+
+ ; horizontal DWT to out buffer, level 2, part 1
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 2]
+ shr al, 4 ; high 4 bits of qtable byte 2
+ call set_quants_hi
+ mov rsi, [rsp + 16] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 0 = 6144
+ mov rdi, [rsp + 24] ; dst hi - HL2
+ lea rdi, [rdi + 6144] ; dst hi - HL2
+ ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+ mov rdx, [rsp + 24] ; dst lo - LL2
+ lea rdx, [rdx + 7680] ; dst lo - LL2
+ call rfx_dwt_2d_encode_block_horiz_16_32_no_lo
+
+ ; horizontal DWT to out buffer, level 2, part 2
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 3]
+ and al, 0xF ; low 4 bits of qtable byte 3
+ call set_quants_hi
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 2]
+ and al, 0xF ; low 4 bits of qtable byte 2
+ call set_quants_lo
+ mov rsi, [rsp + 16] ; src
+ lea rsi, [rsi + 32 * 16 * 2] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 4 = 7168
+ mov rdi, [rsp + 24] ; dst hi - HH2
+ lea rdi, [rdi + 7168] ; dst hi - HH2
+ ; 32 * 32 * 6 + 16 * 16 * 2 = 6656
+ mov rdx, [rsp + 24] ; dst lo - LH2
+ lea rdx, [rdx + 6656] ; dst lo - LH2
+ call rfx_dwt_2d_encode_block_horiz_16_32
+
+ ; vertical DWT to work buffer, level 3
+ ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+ mov rsi, [rsp + 24] ; src
+ lea rsi, [rsi + 7680] ; src
+ mov rdi, [rsp + 16] ; dst hi
+ lea rdi, [rdi + 16 * 8 * 2] ; dst hi
+ mov rdx, [rsp + 16] ; dst lo
+ call rfx_dwt_2d_encode_block_verti_16_16
+
+ ; horizontal DWT to out buffer, level 3, part 1
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 1]
+ and al, 0xF ; low 4 bits of qtable byte 1
+ call set_quants_hi
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 0]
+ and al, 0xF ; low 4 bits of qtable byte 0
+ call set_quants_lo
+ mov rsi, [rsp + 16] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680
+ mov rdi, [rsp + 24] ; dst hi - HL3
+ lea rdi, [rdi + 7680] ; dst hi - HL3
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064
+ mov rdx, [rsp + 24] ; dst lo - LL3
+ lea rdx, [rdx + 8064] ; dst lo - LL3
+ call rfx_dwt_2d_encode_block_horiz_16_16
+
+ ; horizontal DWT to out buffer, level 3, part 2
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 1]
+ shr al, 4 ; high 4 bits of qtable byte 1
+ call set_quants_hi
+ xor rax, rax
+ mov rdx, [rsp]
+ mov al, [rdx + 0]
+ shr al, 4 ; high 4 bits of qtable byte 0
+ call set_quants_lo
+ mov rsi, [rsp + 16] ; src
+ lea rsi, [rsi + 16 * 8 * 2] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936
+ mov rdi, [rsp + 24] ; dst hi - HH3
+ lea rdi, [rdi + 7936] ; dst hi - HH3
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808
+ mov rdx, [rsp + 24] ; dst lo - LH3
+ lea rdx, [rdx + 7808] ; dst lo - LH3
+ call rfx_dwt_2d_encode_block_horiz_16_16
+
+ mov rax, 0 ; return value
+ ; restore registers
+ pop rdi
+ pop rsi
+ pop rcx
+ pop rdx
+ pop rbx
+ ret
+ align 16
+
diff --git a/src/amd64/rfxdwt_amd64_sse2.asm b/src/amd64/rfxdwt_amd64_sse2.asm
deleted file mode 100644
index 4648371..0000000
--- a/src/amd64/rfxdwt_amd64_sse2.asm
+++ /dev/null
@@ -1,21 +0,0 @@
-
-section .data
- const1 times 8 dw 1
-
-%macro PROC 1
- align 16
- global %1
- %1:
-%endmacro
-
-;int
-;dwt_shift_amd64_sse2(const int* qtable, sint8* src, sint16* dst, sint16* temp)
-
-PROC dwt_shift_amd64_sse2
- ; save registers
- push rbx
- mov rax, 0
- pop rbx
- ret
- align 16
-
diff --git a/src/amd64/rfxrlgr1_amd64.asm b/src/amd64/rfxrlgr1_amd64.asm
deleted file mode 100644
index 7c80678..0000000
--- a/src/amd64/rfxrlgr1_amd64.asm
+++ /dev/null
@@ -1,21 +0,0 @@
-
-section .data
- const1 times 8 dw 1
-
-%macro PROC 1
- align 16
- global %1
- %1:
-%endmacro
-
-;int
-;diff_rlgr1_amd64(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
-
-PROC diff_rlgr1_amd64
- ; save registers
- push rbx
- mov rax, 0
- pop rbx
- ret
- align 16
-
diff --git a/src/amd64/rfxrlgr3_amd64.asm b/src/amd64/rfxrlgr3_amd64.asm
deleted file mode 100644
index 3270760..0000000
--- a/src/amd64/rfxrlgr3_amd64.asm
+++ /dev/null
@@ -1,21 +0,0 @@
-
-section .data
- const1 times 8 dw 1
-
-%macro PROC 1
- align 16
- global %1
- %1:
-%endmacro
-
-;int
-;diff_rlgr3_amd64(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
-
-PROC diff_rlgr3_amd64
- ; save registers
- push rbx
- mov rax, 0
- pop rbx
- ret
- align 16
-
diff --git a/src/nasm_lt.sh b/src/nasm_lt.sh
new file mode 100755
index 0000000..6cd7329
--- /dev/null
+++ b/src/nasm_lt.sh
@@ -0,0 +1,57 @@
+#! /bin/sh
+command="" # command line being rebuilt from the arguments; echoed and exec'd at the end
+infile="" # last *.asm argument seen; used to derive a default output name
+o_opt=no # becomes yes once an argument starting with -o has been seen
+pic=no # becomes yes once -DPIC has been appended, so it is added only once
+while [ $# -gt 0 ]; do
+ case "$1" in
+ -DPIC|-fPIC|-fpic|-Kpic|-KPIC)
+ if [ "$pic" != "yes" ] ; then
+ command="$command -DPIC"
+ pic=yes
+ fi
+ ;;
+ -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
+ -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64)
+ # A file format specifier for nasm: pass it through unchanged.
+ command="$command $1"
+ ;;
+ -f*)
+ # Any other -f* flag (maybe a code-generation flag for gcc) is dropped.
+ ;;
+ -[Ii]*)
+ incdir=`echo "$1" | sed 's/^-[Ii]//'`
+ if [ "x$incdir" = x -a "x$2" != x ] ; then
+ case "$2" in
+ -*) ;;
+ *) incdir="$2"; shift;;
+ esac
+ fi
+ if [ "x$incdir" != x ] ; then
+ # In the case of NASM, the trailing slash is necessary, so force exactly one.
+ incdir=`echo "$incdir" | sed 's%/*$%/%'`
+ command="$command -I$incdir"
+ fi
+ ;;
+ -o*)
+ o_opt=yes
+ command="$command $1"
+ ;;
+ *.asm)
+ infile=$1
+ command="$command $1"
+ ;;
+ *)
+ command="$command $1"
+ ;;
+ esac
+ shift
+done
+if [ "$o_opt" != yes ] ; then
+ # By default, NASM creates an output file
+ # in the same directory as the input file.
+ # Here the directory part and the extension of $infile are stripped and .o is appended.
+ outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o"
+ command="$command $outfile"
+fi
+echo $command
+exec $command
diff --git a/src/rfxcommon.h b/src/rfxcommon.h
index 6b4c6f8..74514e9 100644
--- a/src/rfxcommon.h
+++ b/src/rfxcommon.h
@@ -1,7 +1,7 @@
/**
* RFX codec
*
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@
#define MAX(_val1, _val2) (_val1) > (_val2) ? (_val1) : (_val2)
#define MINMAX(_v, _l, _h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v)))
+#define DWT_FACTOR 5
+
typedef signed char sint8;
typedef unsigned char uint8;
typedef signed short sint16;
diff --git a/src/rfxcompose.c b/src/rfxcompose.c
index d3af2dd..f208a32 100644
--- a/src/rfxcompose.c
+++ b/src/rfxcompose.c
@@ -3,6 +3,7 @@
* RemoteFX Codec Library
*
* Copyright 2011 Vic Lee
+ * Copyright 2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -28,12 +29,16 @@
#include "rfxconstants.h"
#include "rfxencode_tile.h"
+#define LLOG_LEVEL 1
+#define LLOGLN(_level, _args) \
+ do { if (_level < LLOG_LEVEL) { printf _args ; printf("\n"); } } while (0)
+
/*
* LL3, LH3, HL3, HH3, LH2, HL2, HH2, LH1, HL1, HH1
*/
-static const int g_rfx_default_quantization_values[] =
+static const unsigned char g_rfx_default_quantization_values[] =
{
- 6, 6, 6, 6, 7, 7, 8, 8, 8, 9
+ 0x66, 0x66, 0x77, 0x88, 0x98
};
/******************************************************************************/
@@ -168,7 +173,7 @@ rfx_compose_message_frame_begin(struct rfxencode* enc, STREAM* s)
/******************************************************************************/
static int
rfx_compose_message_region(struct rfxencode* enc, STREAM* s,
- struct rfx_rect *regions, int num_regions)
+ const struct rfx_rect *regions, int num_regions)
{
int size;
int i;
@@ -200,7 +205,7 @@ rfx_compose_message_region(struct rfxencode* enc, STREAM* s,
static int
rfx_compose_message_tile_yuv(struct rfxencode *enc, STREAM *s,
char *tile_data, int tile_width, int tile_height,
- int stride_bytes, const int *quantVals,
+ int stride_bytes, const char *quantVals,
int quantIdxY, int quantIdxCb, int quantIdxCr,
int xIdx, int yIdx)
{
@@ -221,9 +226,9 @@ rfx_compose_message_tile_yuv(struct rfxencode *enc, STREAM *s,
stream_seek(s, 6); /* YLen, CbLen, CrLen */
if (rfx_encode_yuv(enc, tile_data, tile_width, tile_height,
stride_bytes,
- quantVals + quantIdxY * 10,
- quantVals + quantIdxCb * 10,
- quantVals + quantIdxCr * 10,
+ quantVals + quantIdxY * 5,
+ quantVals + quantIdxCb * 5,
+ quantVals + quantIdxCr * 5,
s, &YLen, &CbLen, &CrLen) != 0)
{
return 1;
@@ -241,9 +246,54 @@ rfx_compose_message_tile_yuv(struct rfxencode *enc, STREAM *s,
/******************************************************************************/
static int
+rfx_compose_message_tile_yuva(struct rfxencode *enc, STREAM *s,
+ char *tile_data, int tile_width, int tile_height,
+ int stride_bytes, const char *quantVals,
+ int quantIdxY, int quantIdxCb, int quantIdxCr,
+ int xIdx, int yIdx)
+{ /* Writes one CBT_TILE block to s: header, then the data produced by rfx_encode_yuva; returns 0 on success, 1 if the encode fails. */
+ int YLen = 0;
+ int CbLen = 0;
+ int CrLen = 0;
+ int ALen = 0;
+ int start_pos; /* stream position of the block header, used to back-patch the lengths */
+ int end_pos; /* stream position after the encoded data */
+
+ start_pos = stream_get_pos(s);
+ stream_write_uint16(s, CBT_TILE); /* BlockT.blockType */
+ stream_seek_uint32(s); /* set BlockT.blockLen later */
+ stream_write_uint8(s, quantIdxY);
+ stream_write_uint8(s, quantIdxCb);
+ stream_write_uint8(s, quantIdxCr);
+ stream_write_uint16(s, xIdx);
+ stream_write_uint16(s, yIdx);
+ stream_seek(s, 8); /* YLen, CbLen, CrLen, ALen (filled in below) */
+ if (rfx_encode_yuva(enc, tile_data, tile_width, tile_height,
+ stride_bytes,
+ quantVals + quantIdxY * 5, /* quant sets are addressed in 5-byte strides */
+ quantVals + quantIdxCb * 5,
+ quantVals + quantIdxCr * 5,
+ s, &YLen, &CbLen, &CrLen, &ALen) != 0)
+ {
+ return 1;
+ }
+ end_pos = stream_get_pos(s);
+ stream_set_pos(s, start_pos + 2); /* back to the blockLen field, just after blockType */
+ stream_write_uint32(s, 19 + YLen + CbLen + CrLen + ALen); /* BlockT.blockLen; NOTE(review): the header written above looks like 13 + 8 = 21 bytes (assuming stream_seek_uint32 skips 4), yet 19 is used - confirm against the decoder */
+ stream_set_pos(s, start_pos + 13); /* start of the four length fields skipped earlier */
+ stream_write_uint16(s, YLen);
+ stream_write_uint16(s, CbLen);
+ stream_write_uint16(s, CrLen);
+ stream_write_uint16(s, ALen);
+ stream_set_pos(s, end_pos); /* resume writing after the tile data */
+ return 0;
+}
+
+/******************************************************************************/
+static int
rfx_compose_message_tile_rgb(struct rfxencode *enc, STREAM *s,
char *tile_data, int tile_width, int tile_height,
- int stride_bytes, const int *quantVals,
+ int stride_bytes, const char *quantVals,
int quantIdxY, int quantIdxCb, int quantIdxCr,
int xIdx, int yIdx)
{
@@ -264,9 +314,9 @@ rfx_compose_message_tile_rgb(struct rfxencode *enc, STREAM *s,
stream_seek(s, 6); /* YLen, CbLen, CrLen */
if (rfx_encode_rgb(enc, tile_data, tile_width, tile_height,
stride_bytes,
- quantVals + quantIdxY * 10,
- quantVals + quantIdxCb * 10,
- quantVals + quantIdxCr * 10,
+ quantVals + quantIdxY * 5,
+ quantVals + quantIdxCb * 5,
+ quantVals + quantIdxCr * 5,
s, &YLen, &CbLen, &CrLen) != 0)
{
return 1;
@@ -284,19 +334,66 @@ rfx_compose_message_tile_rgb(struct rfxencode *enc, STREAM *s,
/******************************************************************************/
static int
+rfx_compose_message_tile_argb(struct rfxencode *enc, STREAM *s,
+ char *tile_data, int tile_width, int tile_height,
+ int stride_bytes, const char *quantVals,
+ int quantIdxY, int quantIdxCb, int quantIdxCr,
+ int xIdx, int yIdx)
+{ /* Writes one CBT_TILE block to s: header, then the data produced by rfx_encode_argb; returns 0 on success, 1 if the encode fails. */
+ int YLen = 0;
+ int CbLen = 0;
+ int CrLen = 0;
+ int ALen = 0;
+ int start_pos; /* stream position of the block header, used to back-patch the lengths */
+ int end_pos; /* stream position after the encoded data */
+
+ LLOGLN(10, ("rfx_compose_message_tile_argb:"));
+ start_pos = stream_get_pos(s);
+ stream_write_uint16(s, CBT_TILE); /* BlockT.blockType */
+ stream_seek_uint32(s); /* set BlockT.blockLen later */
+ stream_write_uint8(s, quantIdxY);
+ stream_write_uint8(s, quantIdxCb);
+ stream_write_uint8(s, quantIdxCr);
+ stream_write_uint16(s, xIdx);
+ stream_write_uint16(s, yIdx);
+ stream_seek(s, 8); /* YLen, CbLen, CrLen, ALen (filled in below) */
+ if (rfx_encode_argb(enc, tile_data, tile_width, tile_height,
+ stride_bytes,
+ quantVals + quantIdxY * 5, /* quant sets are addressed in 5-byte strides */
+ quantVals + quantIdxCb * 5,
+ quantVals + quantIdxCr * 5,
+ s, &YLen, &CbLen, &CrLen, &ALen) != 0)
+ {
+ LLOGLN(10, ("rfx_compose_message_tile_argb: rfx_encode_argb failed"));
+ return 1;
+ }
+ end_pos = stream_get_pos(s);
+ stream_set_pos(s, start_pos + 2); /* back to the blockLen field, just after blockType */
+ stream_write_uint32(s, 19 + YLen + CbLen + CrLen + ALen); /* BlockT.blockLen; NOTE(review): the header written above looks like 13 + 8 = 21 bytes (assuming stream_seek_uint32 skips 4), yet 19 is used - confirm against the decoder */
+ stream_set_pos(s, start_pos + 13); /* start of the four length fields skipped earlier */
+ stream_write_uint16(s, YLen);
+ stream_write_uint16(s, CbLen);
+ stream_write_uint16(s, CrLen);
+ stream_write_uint16(s, ALen);
+ stream_set_pos(s, end_pos); /* resume writing after the tile data */
+ return 0;
+}
+
+/******************************************************************************/
+static int
rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s,
char* buf, int width, int height,
int stride_bytes,
- struct rfx_tile *tiles, int num_tiles,
- const int *quants, int num_quants)
+ const struct rfx_tile *tiles, int num_tiles,
+ const char *quants, int num_quants,
+ int flags)
{
int size;
int start_pos;
int end_pos;
int index;
int numQuants;
- const int *quantVals;
- const int *quantValsPtr;
+ const char *quantVals;
int quantIdxY;
int quantIdxCb;
int quantIdxCr;
@@ -308,10 +405,11 @@ rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s,
int cy;
char *tile_data;
+ LLOGLN(10, ("rfx_compose_message_tileset:"));
if (quants == 0)
{
numQuants = 1;
- quantVals = g_rfx_default_quantization_values;
+ quantVals = (const char *) g_rfx_default_quantization_values;
}
else
{
@@ -321,7 +419,15 @@ rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s,
numTiles = num_tiles;
size = 22 + numQuants * 5;
start_pos = stream_get_pos(s);
- stream_write_uint16(s, WBT_EXTENSION); /* CodecChannelT.blockType */
+ if (flags & RFX_FLAGS_ALPHAV1)
+ {
+ LLOGLN(10, ("rfx_compose_message_tileset: RFX_FLAGS_ALPHAV1 set"));
+ stream_write_uint16(s, WBT_EXTENSION_PLUS); /* CodecChannelT.blockType */
+ }
+ else
+ {
+ stream_write_uint16(s, WBT_EXTENSION); /* CodecChannelT.blockType */
+ }
stream_seek_uint32(s); /* set CodecChannelT.blockLen later */
stream_write_uint8(s, 1); /* CodecChannelT.codecId */
stream_write_uint8(s, 0); /* CodecChannelT.channelId */
@@ -332,54 +438,100 @@ rfx_compose_message_tileset(struct rfxencode* enc, STREAM* s,
stream_write_uint8(s, 0x40); /* tileSize */
stream_write_uint16(s, numTiles); /* numTiles */
stream_seek_uint32(s); /* set tilesDataSize later */
- quantValsPtr = quantVals;
- for (index = 0; index < numQuants * 5; index++)
- {
- stream_write_uint8(s, quantValsPtr[0] + (quantValsPtr[1] << 4));
- quantValsPtr += 2;
- }
+ memcpy(s->p, quantVals, numQuants * 5);
+ s->p += numQuants * 5;
end_pos = stream_get_pos(s);
if (enc->format == RFX_FORMAT_YUV)
{
- for (index = 0; index < numTiles; index++)
+ if (flags & RFX_FLAGS_ALPHAV1)
+ {
+ for (index = 0; index < numTiles; index++)
+ {
+ x = tiles[index].x;
+ y = tiles[index].y;
+ cx = tiles[index].cx;
+ cy = tiles[index].cy;
+ quantIdxY = tiles[index].quant_y;
+ quantIdxCb = tiles[index].quant_cb;
+ quantIdxCr = tiles[index].quant_cr;
+ tile_data = buf + (y << 8) * (stride_bytes >> 8) + (x << 8);
+ if (rfx_compose_message_tile_yuva(enc, s,
+ tile_data, cx, cy, stride_bytes,
+ quantVals,
+ quantIdxY, quantIdxCb, quantIdxCr,
+ x / 64, y / 64) != 0)
+ {
+ return 1;
+ }
+ }
+ }
+ else
{
- x = tiles[index].x;
- y = tiles[index].y;
- cx = tiles[index].cx;
- cy = tiles[index].cy;
- quantIdxY = tiles[index].quant_y;
- quantIdxCb = tiles[index].quant_cb;
- quantIdxCr = tiles[index].quant_cr;
- tile_data = buf + (y << 8) * (stride_bytes >> 8) + (x << 8);
- if (rfx_compose_message_tile_yuv(enc, s,
- tile_data, cx, cy, stride_bytes,
- quantVals,
- quantIdxY, quantIdxCb, quantIdxCr,
- x / 64, y / 64) != 0)
+ for (index = 0; index < numTiles; index++)
{
- return 1;
+ x = tiles[index].x;
+ y = tiles[index].y;
+ cx = tiles[index].cx;
+ cy = tiles[index].cy;
+ quantIdxY = tiles[index].quant_y;
+ quantIdxCb = tiles[index].quant_cb;
+ quantIdxCr = tiles[index].quant_cr;
+ tile_data = buf + (y << 8) * (stride_bytes >> 8) + (x << 8);
+ if (rfx_compose_message_tile_yuv(enc, s,
+ tile_data, cx, cy, stride_bytes,
+ quantVals,
+ quantIdxY, quantIdxCb, quantIdxCr,
+ x / 64, y / 64) != 0)
+ {
+ return 1;
+ }
}
}
}
else
{
- for (index = 0; index < numTiles; index++)
+ if (flags & RFX_FLAGS_ALPHAV1)
+ {
+ for (index = 0; index < numTiles; index++)
+ {
+ x = tiles[index].x;
+ y = tiles[index].y;
+ cx = tiles[index].cx;
+ cy = tiles[index].cy;
+ quantIdxY = tiles[index].quant_y;
+ quantIdxCb = tiles[index].quant_cb;
+ quantIdxCr = tiles[index].quant_cr;
+ tile_data = buf + y * stride_bytes + x * (enc->bits_per_pixel / 8);
+ if (rfx_compose_message_tile_argb(enc, s,
+ tile_data, cx, cy, stride_bytes,
+ quantVals,
+ quantIdxY, quantIdxCb, quantIdxCr,
+ x / 64, y / 64) != 0)
+ {
+ return 1;
+ }
+ }
+ }
+ else
{
- x = tiles[index].x;
- y = tiles[index].y;
- cx = tiles[index].cx;
- cy = tiles[index].cy;
- quantIdxY = tiles[index].quant_y;
- quantIdxCb = tiles[index].quant_cb;
- quantIdxCr = tiles[index].quant_cr;
- tile_data = buf + y * stride_bytes + x * (enc->bits_per_pixel / 8);
- if (rfx_compose_message_tile_rgb(enc, s,
- tile_data, cx, cy, stride_bytes,
- quantVals,
- quantIdxY, quantIdxCb, quantIdxCr,
- x / 64, y / 64) != 0)
+ for (index = 0; index < numTiles; index++)
{
- return 1;
+ x = tiles[index].x;
+ y = tiles[index].y;
+ cx = tiles[index].cx;
+ cy = tiles[index].cy;
+ quantIdxY = tiles[index].quant_y;
+ quantIdxCb = tiles[index].quant_cb;
+ quantIdxCr = tiles[index].quant_cr;
+ tile_data = buf + y * stride_bytes + x * (enc->bits_per_pixel / 8);
+ if (rfx_compose_message_tile_rgb(enc, s,
+ tile_data, cx, cy, stride_bytes,
+ quantVals,
+ quantIdxY, quantIdxCb, quantIdxCr,
+ x / 64, y / 64) != 0)
+ {
+ return 1;
+ }
}
}
}
@@ -412,10 +564,10 @@ rfx_compose_message_frame_end(struct rfxencode* enc, STREAM* s)
/******************************************************************************/
int
rfx_compose_message_data(struct rfxencode* enc, STREAM* s,
- struct rfx_rect *regions, int num_regions,
+ const struct rfx_rect *regions, int num_regions,
char *buf, int width, int height, int stride_bytes,
- struct rfx_tile *tiles, int num_tiles,
- const int *quants, int num_quants)
+ const struct rfx_tile *tiles, int num_tiles,
+ const char *quants, int num_quants, int flags)
{
if (rfx_compose_message_frame_begin(enc, s) != 0)
{
@@ -426,7 +578,8 @@ rfx_compose_message_data(struct rfxencode* enc, STREAM* s,
return 1;
}
if (rfx_compose_message_tileset(enc, s, buf, width, height, stride_bytes,
- tiles, num_tiles, quants, num_quants) != 0)
+ tiles, num_tiles, quants, num_quants,
+ flags) != 0)
{
return 1;
}
diff --git a/src/rfxcompose.h b/src/rfxcompose.h
index aab4770..7d30233 100644
--- a/src/rfxcompose.h
+++ b/src/rfxcompose.h
@@ -1,7 +1,7 @@
/**
* RFX codec encoder
*
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,9 +25,9 @@ int
rfx_compose_message_header(struct rfxencode* enc, STREAM* s);
int
rfx_compose_message_data(struct rfxencode* enc, STREAM* s,
- struct rfx_rect *regions, int num_regions,
+ const struct rfx_rect *regions, int num_regions,
char *buf, int width, int height, int stride_bytes,
- struct rfx_tile *tiles, int num_tiles,
- const int *quants, int num_quants);
+ const struct rfx_tile *tiles, int num_tiles,
+ const char *quants, int num_quants, int flags);
#endif
diff --git a/src/rfxconstants.h b/src/rfxconstants.h
index 05cb18d..770fccb 100644
--- a/src/rfxconstants.h
+++ b/src/rfxconstants.h
@@ -39,6 +39,7 @@ enum _RLGR_MODE
#define WBT_FRAME_END 0xCCC5
#define WBT_REGION 0xCCC6
#define WBT_EXTENSION 0xCCC7
+#define WBT_EXTENSION_PLUS 0xDDD7
#define CBT_REGION 0xCAC1
#define CBT_TILESET 0xCAC2
#define CBT_TILE 0xCAC3
diff --git a/src/rfxencode.c b/src/rfxencode.c
index 4ad57f8..9bbf103 100644
--- a/src/rfxencode.c
+++ b/src/rfxencode.c
@@ -1,7 +1,7 @@
/**
* RFX codec encoder
*
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -28,103 +28,46 @@
#include "rfxconstants.h"
#include "rfxencode_tile.h"
-/******************************************************************************/
-static void
-cpuid(int func, int *eax, int *ebx, int *ecx, int *edx)
-{
- *eax = 0;
- *ebx = 0;
- *ecx = 0;
- *edx = 0;
-#ifdef __GNUC__
-#if defined(__i386__) || defined(__x86_64__)
- *eax = func;
- __asm volatile
- (
- "mov %%ebx, %%edi;"
- "cpuid;"
- "mov %%ebx, %%esi;"
- "mov %%edi, %%ebx;"
- :"+a" (*eax), "=S" (*ebx), "=c" (*ecx), "=d" (*edx)
- : :"edi"
- );
-#endif
+#ifdef RFX_USE_ACCEL_X86
+#include "x86/funcs_x86.h"
#endif
-}
-
-#if 0
-inline unsigned int get_cpu_feature_flags()
-{
- unsigned int features;
-
- __asm
- {
- // Save registers
- push eax
- push ebx
- push ecx
- push edx
-
- // Get the feature flags (eax=1) from edx
- mov eax, 1
- cpuid
- mov features, edx
-
- // Restore registers
- pop edx
- pop ecx
- pop ebx
- pop eax
- }
-
- return features;
-}
-
-#define cpuid(func,a,b,c,d)\
- asm {\
- mov eax, func\
- cpuid\
- mov a, eax\
- mov b, ebx\
- mov c, ecx\
- mov d, edx\
- }
-#endif
-
-// http://softpixel.com/~cwright/programming/simd/cpuid.php
-
-#define SSE4_1_FLAG 0x080000
-#define SSE4_2_FLAG 0x100000
-
-/*
-Function 0x80000001:
-bit (edx) feature
-22 AMD MMX Extensions
-30 3DNow!2
-31 3DNow!
-*/
-
-#if 0
-#define cpuid(_func, _ax, _bx, _cx, _dx) \
- __asm volatile ("cpuid": \
- "=a" (_ax), "=b" (_bx), "=c" (_cx), "=d" (_dx) : "a" (_func));
+#ifdef RFX_USE_ACCEL_AMD64
+#include "amd64/funcs_amd64.h"
#endif
/******************************************************************************/
-void *
-rfxcodec_encode_create(int width, int height, int format, int flags)
+int
+rfxcodec_encode_create_ex(int width, int height, int format, int flags,
+ void **handle)
{
struct rfxencode *enc;
- int ax, bx, cx, dx;
+ int ax;
+ int bx;
+ int cx;
+ int dx;
enc = (struct rfxencode *) malloc(sizeof(struct rfxencode));
if (enc == 0)
{
- return 0;
+ return 1;
}
memset(enc, 0, sizeof(struct rfxencode));
- cpuid(1, &ax, &bx, &cx, &dx);
+
+ enc->dwt_buffer = (sint16*)(((size_t)(enc->dwt_buffer_a)) & ~15);
+ enc->dwt_buffer1 = (sint16*)(((size_t)(enc->dwt_buffer1_a)) & ~15);
+ enc->dwt_buffer2 = (sint16*)(((size_t)(enc->dwt_buffer2_a)) & ~15);
+
+#if defined(RFX_USE_ACCEL_X86)
+ cpuid_x86(1, 0, &ax, &bx, &cx, &dx);
+#elif defined(RFX_USE_ACCEL_AMD64)
+ cpuid_amd64(1, 0, &ax, &bx, &cx, &dx);
+#else
+ ax = 0;
+ bx = 0;
+ cx = 0;
+ dx = 0;
+#endif
if (dx & (1 << 26)) /* SSE 2 */
{
printf("rfxcodec_encode_create: got sse2\n");
@@ -150,7 +93,16 @@ rfxcodec_encode_create(int width, int height, int format, int flags)
printf("rfxcodec_encode_create: got popcnt\n");
enc->got_popcnt = 1;
}
- cpuid(0x80000001, &ax, &bx, &cx, &dx);
+#if defined(RFX_USE_ACCEL_X86)
+ cpuid_x86(0x80000001, 0, &ax, &bx, &cx, &dx);
+#elif defined(RFX_USE_ACCEL_AMD64)
+ cpuid_amd64(0x80000001, 0, &ax, &bx, &cx, &dx);
+#else
+ ax = 0;
+ bx = 0;
+ cx = 0;
+ dx = 0;
+#endif
if (cx & (1 << 5)) /* lzcnt */
{
printf("rfxcodec_encode_create: got lzcnt\n");
@@ -169,7 +121,7 @@ rfxcodec_encode_create(int width, int height, int format, int flags)
{
enc->mode = RLGR1;
}
- switch (format)
+ switch (format)
{
case RFX_FORMAT_BGRA:
enc->bits_per_pixel = 32;
@@ -188,7 +140,7 @@ rfxcodec_encode_create(int width, int height, int format, int flags)
break;
default:
free(enc);
- return NULL;
+ return 2;
}
enc->format = format;
/* assign encoding functions */
@@ -196,29 +148,133 @@ rfxcodec_encode_create(int width, int height, int format, int flags)
{
if (enc->mode == RLGR3)
{
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n");
enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */
}
else
{
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n");
enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */
}
}
else
{
-#if defined(RFX_USE_ACCEL) && RFX_USE_ACCEL
- enc->rfx_encode = rfx_encode_component_x86_sse4; /* rfxencode_tile.c */
+#if defined(RFX_USE_ACCEL_X86)
+ if (enc->got_sse41)
+ {
+ if (enc->mode == RLGR3)
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_x86_sse41\n");
+ enc->rfx_encode = rfx_encode_component_rlgr3_x86_sse41; /* rfxencode_tile.c */
+ }
+ else
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_x86_sse41\n");
+ enc->rfx_encode = rfx_encode_component_rlgr1_x86_sse41; /* rfxencode_tile.c */
+ }
+ }
+ else if (enc->got_sse2)
+ {
+ if (enc->mode == RLGR3)
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_x86_sse2\n");
+ enc->rfx_encode = rfx_encode_component_rlgr3_x86_sse2; /* rfxencode_tile.c */
+ }
+ else
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_x86_sse2\n");
+ enc->rfx_encode = rfx_encode_component_rlgr1_x86_sse2; /* rfxencode_tile.c */
+ }
+ }
+ else
+ {
+ if (enc->mode == RLGR3)
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n");
+ enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */
+ }
+ else
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n");
+ enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */
+ }
+ }
+#elif defined(RFX_USE_ACCEL_AMD64)
+ if (enc->got_sse41)
+ {
+ if (enc->mode == RLGR3)
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_amd64_sse41\n");
+ enc->rfx_encode = rfx_encode_component_rlgr3_amd64_sse41; /* rfxencode_tile.c */
+ }
+ else
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_amd64_sse41\n");
+ enc->rfx_encode = rfx_encode_component_rlgr1_amd64_sse41; /* rfxencode_tile.c */
+ }
+ }
+ else if (enc->got_sse2)
+ {
+ if (enc->mode == RLGR3)
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3_amd64_sse2\n");
+ enc->rfx_encode = rfx_encode_component_rlgr3_amd64_sse2; /* rfxencode_tile.c */
+ }
+ else
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1_amd64_sse2\n");
+ enc->rfx_encode = rfx_encode_component_rlgr1_amd64_sse2; /* rfxencode_tile.c */
+ }
+ }
+ else
+ {
+ if (enc->mode == RLGR3)
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n");
+ enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */
+ }
+ else
+ {
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n");
+ enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */
+ }
+ }
#else
if (enc->mode == RLGR3)
{
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr3\n");
enc->rfx_encode = rfx_encode_component_rlgr3; /* rfxencode_tile.c */
}
else
{
+ printf("rfxcodec_encode_create: rfx_encode set to rfx_encode_component_rlgr1\n");
enc->rfx_encode = rfx_encode_component_rlgr1; /* rfxencode_tile.c */
}
#endif
}
- return enc;
+ if (ax == 0)
+ {
+ }
+ if (bx == 0)
+ {
+ }
+ *handle = enc;
+ return 0;
+}
+
+/******************************************************************************/
+void *
+rfxcodec_encode_create(int width, int height, int format, int flags)
+{ /* Pointer-returning wrapper around rfxcodec_encode_create_ex: yields the new handle, or 0 when that call reports an error. */
+ int error;
+ void *handle; /* only read when error == 0 */
+
+ error = rfxcodec_encode_create_ex(width, height, format, flags, &handle);
+ if (error == 0)
+ {
+ return handle;
+ }
+ return 0; /* the specific non-zero error code is not passed on to the caller */
}
/******************************************************************************/
@@ -238,11 +294,11 @@ rfxcodec_encode_destroy(void * handle)
/******************************************************************************/
int
-rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes,
- char *buf, int width, int height, int stride_bytes,
- struct rfx_rect *regions, int num_regions,
- struct rfx_tile *tiles, int num_tiles,
- const int *quants, int num_quants)
+rfxcodec_encode_ex(void *handle, char *cdata, int *cdata_bytes,
+ char *buf, int width, int height, int stride_bytes,
+ const struct rfx_rect *regions, int num_regions,
+ const struct rfx_tile *tiles, int num_tiles,
+ const char *quants, int num_quants, int flags)
{
struct rfxencode *enc;
STREAM s;
@@ -263,10 +319,25 @@ rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes,
}
if (rfx_compose_message_data(enc, &s, regions, num_regions,
buf, width, height, stride_bytes,
- tiles, num_tiles, quants, num_quants) != 0)
+ tiles, num_tiles, quants, num_quants,
+ flags) != 0)
{
return 1;
}
*cdata_bytes = (int) (s.p - s.data);
return 0;
}
+
+/******************************************************************************/
+int
+rfxcodec_encode(void *handle, char *cdata, int *cdata_bytes,
+ char *buf, int width, int height, int stride_bytes,
+ const struct rfx_rect *regions, int num_regions,
+ const struct rfx_tile *tiles, int num_tiles,
+ const char *quants, int num_quants)
+{ /* Wrapper that forwards every argument to rfxcodec_encode_ex with flags = 0 and returns its result. */
+ return rfxcodec_encode_ex(handle, cdata, cdata_bytes, buf, width, height,
+ stride_bytes, regions, num_regions, tiles,
+ num_tiles, quants, num_quants, 0);
+}
+
diff --git a/src/rfxencode.h b/src/rfxencode.h
index 4db6a01..c9fc5d0 100644
--- a/src/rfxencode.h
+++ b/src/rfxencode.h
@@ -1,7 +1,7 @@
/**
* RFX codec encoder
*
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,8 +21,7 @@
struct rfxencode;
-typedef int (*rfx_encode_proc)(struct rfxencode *enc,
- const int *quantization_values,
+typedef int (*rfx_encode_proc)(struct rfxencode *enc, const char *qtable,
uint8 *data, uint8 *buffer,
int buffer_size, int *size);
@@ -39,13 +38,18 @@ struct rfxencode
int format;
int pad0[7];
+ uint8 a_buffer[4096];
uint8 y_r_buffer[4096];
- uint8 cb_g_buffer[4096];
- uint8 cr_b_buffer[4096];
-
- sint16 dwt_buffer[4096];
- sint16 dwt_buffer1[4096];
-
+ uint8 u_g_buffer[4096];
+ uint8 v_b_buffer[4096];
+ uint8 pad1[16];
+ sint16 dwt_buffer_a[4096];
+ sint16 dwt_buffer1_a[4096];
+ sint16 dwt_buffer2_a[4096];
+ uint8 pad2[16];
+ sint16* dwt_buffer;
+ sint16* dwt_buffer1;
+ sint16* dwt_buffer2;
rfx_encode_proc rfx_encode;
int got_sse2;
@@ -56,7 +60,6 @@ struct rfxencode
int got_popcnt;
int got_lzcnt;
int got_neon;
-
};
#endif
diff --git a/src/rfxencode_alpha.c b/src/rfxencode_alpha.c
new file mode 100644
index 0000000..58d8e10
--- /dev/null
+++ b/src/rfxencode_alpha.c
@@ -0,0 +1,279 @@
+/**
+ * librfxcodec: A Remote Desktop Protocol client.
+ * RemoteFX Codec Library
+ *
+ * Copyright 2015 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rfxcodec_encode.h>
+
+#include "rfxcommon.h"
+#include "rfxencode.h"
+#include "rfxconstants.h"
+#include "rfxencode_tile.h"
+
+#define LLOG_LEVEL 1
+#define LLOGLN(_level, _args) \
+ do { if (_level < LLOG_LEVEL) { printf _args ; printf("\n"); } } while (0)
+
+#if 1
+/*****************************************************************************/
+static int
+fdelta(char *in_plane, char *out_plane, int cx, int cy)
+{ /* Delta-encodes a cx by cy byte plane from in_plane into out_plane, row against the row above; always returns 0. */
+ char delta;
+ char *src8;
+ char *dst8;
+ int index;
+ int jndex;
+
+ memcpy(out_plane, in_plane, cx); /* the first row is copied unchanged */
+ src8 = in_plane;
+ dst8 = out_plane;
+ for (jndex = 1; jndex < cy; jndex++) /* rows 1 .. cy - 1; the pointers keep advancing across rows */
+ {
+ for (index = 0; index < cx; index++)
+ {
+ delta = src8[cx] - src8[0]; /* byte in the next row minus the byte directly above it */
+ if (delta & 0x80)
+ {
+ delta = (((~delta) + 1) << 1) - 1; /* sign bit set: store 2 * (-delta) - 1, truncated to char */
+ }
+ else
+ {
+ delta = delta << 1; /* sign bit clear: store 2 * delta */
+ }
+ dst8[cx] = delta; /* written one row below the current position */
+ src8++;
+ dst8++;
+ }
+ }
+ return 0;
+}
+#endif
+
+#if 0 /* compiled out: alternative fdelta that uses the DELTA_ONE macro, eight elements per loop pass */
+/*****************************************************************************/
+#define DELTA_ONE \
+do { \
+ delta = src8[cx] - src8[0]; \
+ is_neg = (delta >> 7) & 1; \
+ dst8[cx] = (((delta ^ -is_neg) + is_neg) << 1) - is_neg; \
+ src8++; \
+ dst8++; \
+} while (0)
+
+/*****************************************************************************/
+static int
+fdelta(char *in_plane, char *out_plane, int cx, int cy)
+{ /* Same interface as the enabled fdelta; processes all bytes after the first row as one flat range. */
+ char delta;
+ char is_neg; /* 1 when bit 7 of delta is set, else 0 */
+ char *src8;
+ char *dst8;
+ char *src8_end;
+
+ memcpy(out_plane, in_plane, cx); /* the first row is copied unchanged */
+ src8 = in_plane;
+ dst8 = out_plane;
+ src8_end = src8 + (cx * cy - cx); /* one past the last byte that still has a row below it */
+ while (src8 + 8 <= src8_end) /* eight elements per pass */
+ {
+ DELTA_ONE;
+ DELTA_ONE;
+ DELTA_ONE;
+ DELTA_ONE;
+ DELTA_ONE;
+ DELTA_ONE;
+ DELTA_ONE;
+ DELTA_ONE;
+ }
+ while (src8 < src8_end) /* remaining elements, one at a time */
+ {
+ DELTA_ONE;
+ }
+ return 0;
+}
+#endif
+
+/*****************************************************************************/
+static int
+fout(int collen, int replen, char *colptr, STREAM *s)
+{ /* Writes to s the control bytes and raw bytes for collen literal bytes at colptr followed by replen repeats; always returns 0. */
+ int code;
+ int lcollen;
+ int lreplen;
+ int cont;
+
+ LLOGLN(10, ("fout: collen %d replen %d", collen, replen));
+ cont = collen > 13; /* literal-only chunks are emitted while more than 13 literals remain */
+ while (cont)
+ {
+ lcollen = collen;
+ if (lcollen > 15)
+ {
+ lcollen = 15; /* at most 15 literals per control byte */
+ }
+ code = lcollen << 4; /* high nibble = literal count, low nibble = 0 */
+ stream_write_uint8(s, code);
+ memcpy(s->p, colptr, lcollen);
+ s->p += lcollen;
+ colptr += lcollen;
+ collen -= lcollen;
+ cont = collen > 13;
+ }
+ cont = (collen > 0) || (replen > 0);
+ while (cont)
+ {
+ lreplen = replen;
+ if ((collen == 0) && (lreplen > 15))
+ {
+ /* big run: no pending literals and more than 15 repeats; up to 47 per control byte */
+ if (lreplen > 47)
+ {
+ lreplen = 47;
+ }
+ LLOGLN(10, ("fout: big run lreplen %d", lreplen));
+ replen -= lreplen;
+ code = ((lreplen & 0xF) << 4) | ((lreplen & 0xF0) >> 4); /* low 4 bits of the run go in the high nibble, bits 4-7 in the low nibble */
+ stream_write_uint8(s, code);
+ colptr += lreplen; /* skip the source bytes covered by the run */
+ }
+ else
+ {
+ if (lreplen > 15)
+ {
+ lreplen = 15;
+ }
+ replen -= lreplen;
+ if (lreplen < 3)
+ {
+ collen += lreplen; /* runs shorter than 3 are sent as literal bytes instead */
+ lreplen = 0;
+ }
+ code = (collen << 4) | lreplen; /* high nibble = literal count, low nibble = run length */
+ stream_write_uint8(s, code);
+ memcpy(s->p, colptr, collen);
+ s->p += collen;
+ colptr += collen + lreplen;
+ collen = 0;
+ }
+ cont = replen > 0; /* keep going until the whole run has been emitted */
+ }
+ return 0;
+}
+
+/*****************************************************************************/
+static int
+fpack(char *plane, int cx, int cy, STREAM *s)
+{ /* Scans each of the cy rows (cx bytes each) of plane for literals and repeats, emits them through fout, and returns the number of bytes written to s. */
+ char *ptr8;
+ char *colptr; /* start of the literal bytes not yet emitted */
+ char *lend; /* last byte of the current row */
+ uint8 *holdp; /* stream position on entry, used to compute the return value */
+ int jndex;
+ int collen; /* pending literal count */
+ int replen; /* pending repeat count */
+
+ LLOGLN(10, ("fpack:"));
+ holdp = s->p;
+ for (jndex = 0; jndex < cy; jndex++)
+ {
+ LLOGLN(10, ("line start line %d cx %d cy %d", jndex, cx, cy));
+ ptr8 = (char *) (plane + jndex * cx);
+ lend = ptr8 + (cx - 1);
+ colptr = ptr8;
+ if (colptr[0] == 0)
+ {
+ collen = 0; /* a leading zero byte is counted as a repeat with no literals */
+ replen = 1;
+ }
+ else
+ {
+ collen = 1; /* any other first byte starts as a single literal */
+ replen = 0;
+ }
+ while (ptr8 < lend)
+ {
+ if (ptr8[0] == ptr8[1])
+ {
+ replen++;
+ }
+ else
+ {
+ if (replen > 0)
+ {
+ if (replen < 3)
+ {
+ collen += replen + 1; /* short repeat: fold it and the new byte into the literals */
+ replen = 0;
+ }
+ else
+ {
+ fout(collen, replen, colptr, s);
+ colptr = ptr8 + 1; /* the next byte starts a new literal sequence */
+ replen = 0;
+ collen = 1;
+ }
+ }
+ else
+ {
+ collen++;
+ }
+ }
+ ptr8++;
+ }
+ /* end of line: flush whatever is still pending for this row */
+ fout(collen, replen, colptr, s);
+ }
+ return (int) (s->p - holdp);
+}
+
+/*****************************************************************************/
+int
+rfx_encode_plane(struct rfxencode *enc, uint8 *plane, int cx, int cy,
+ STREAM *s)
+{ /* Writes a cx by cy byte plane to s, packed via fdelta + fpack, or raw when the packed form is larger than cx * cy; returns a byte count. */
+ char *org_plane;
+ char *delta_plane;
+ int bytes;
+ uint8 *holdp; /* stream position before the flags byte, restored for the raw fallback */
+
+ org_plane = (char *) plane;
+ delta_plane = (char *) (enc->dwt_buffer1); /* dwt_buffer1 is used here as byte scratch space for the delta plane */
+ fdelta(org_plane, delta_plane, cx, cy);
+ holdp = s->p;
+ stream_write_uint8(s, 0x10); /* flags, RLE */
+ bytes = fpack(delta_plane, cx, cy, s); /* count excludes the flags byte written just above */
+ if (bytes > cx * cy)
+ {
+ LLOGLN(10, ("rfx_encode_plane: too big bytes %d", bytes));
+ s->p = holdp; /* discard the packed output and overwrite it */
+ stream_write_uint8(s, 0); /* flags */
+ memcpy(s->p, plane, cx * cy);
+ s->p += cx * cy;
+ stream_write_uint8(s, 0); /* pad if not RLE */
+ bytes = cx * cy + 2; /* flags + raw plane + pad; NOTE(review): this count includes the flags byte while the RLE path's count does not - confirm what the caller expects */
+ }
+ else
+ {
+ LLOGLN(10, ("rfx_encode_plane: ok bytes %d", bytes));
+ }
+ return bytes;
+}
diff --git a/src/rfxencode_alpha.h b/src/rfxencode_alpha.h
new file mode 100644
index 0000000..3f01218
--- /dev/null
+++ b/src/rfxencode_alpha.h
@@ -0,0 +1,28 @@
+/**
+ * librfxcodec: A Remote Desktop Protocol client.
+ * RemoteFX Codec Library
+ *
+ * Copyright 2015 Jay Sorg <jay.sorg@gmail.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __RFXCODEC_ENCODE_ALPHA_H
+#define __RFXCODEC_ENCODE_ALPHA_H
+
+int
+rfx_encode_plane(struct rfxencode *enc, uint8 *plane, int cx, int cy,
+ STREAM *s);
+
+#endif
+
diff --git a/src/rfxencode_dwt.c b/src/rfxencode_dwt.c
index b68b765..36c8e93 100644
--- a/src/rfxencode_dwt.c
+++ b/src/rfxencode_dwt.c
@@ -3,7 +3,7 @@
* RemoteFX Codec Library - DWT
*
* Copyright 2011 Vic Lee
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -151,6 +151,7 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer,
{
uint8 *src;
sint16 *l, *h;
+ sint16 s1, s2, s3;
int total_width;
int x, y;
int n;
@@ -166,8 +167,12 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer,
l = dwt + x;
h = l + subband_width * total_width;
src = in_buffer + x;
- *h = ((src[total_width] - 128) - (((src[0] - 128) + (src[2 * total_width] - 128)) >> 1)) >> 1;
- *l = (src[0] - 128) + *h;
+ s1 = (src[total_width] - 128) << DWT_FACTOR;
+ s2 = (src[0] - 128) << DWT_FACTOR;
+ s3 = (src[2 * total_width] - 128) << DWT_FACTOR;
+ *h = (s1 - ((s2 + s3) >> 1)) >> 1;
+ s1 = (src[0] - 128) << DWT_FACTOR;
+ *l = s1 + *h;
/* loop */
for (n = 1; n < subband_width - 1; n++)
@@ -176,8 +181,12 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer,
l = dwt + n * total_width + x;
h = l + subband_width * total_width;
src = in_buffer + y * total_width + x;
- *h = ((src[total_width] - 128) - (((src[0] - 128) + (src[2 * total_width] - 128)) >> 1)) >> 1;
- *l = (src[0] - 128) + ((*(h - total_width) + *h) >> 1);
+ s1 = (src[total_width] - 128) << DWT_FACTOR;
+ s2 = (src[0] - 128) << DWT_FACTOR;
+ s3 = (src[2 * total_width] - 128) << DWT_FACTOR;
+ *h = (s1 - ((s2 + s3) >> 1)) >> 1;
+ s1 = (src[0] - 128) << DWT_FACTOR;
+ *l = s1 + ((*(h - total_width) + *h) >> 1);
}
/* post */
@@ -186,8 +195,12 @@ rfx_dwt_2d_encode_block8(uint8 *in_buffer,
l = dwt + n * total_width + x;
h = l + subband_width * total_width;
src = in_buffer + y * total_width + x;
- *h = ((src[total_width] - 128) - (((src[0] - 128) + (src[0] - 128)) >> 1)) >> 1;
- *l = (src[0] - 128) + ((*(h - total_width) + *h) >> 1);
+ s1 = (src[total_width] - 128) << DWT_FACTOR;
+ s2 = (src[0] - 128) << DWT_FACTOR;
+ s3 = (src[0] - 128) << DWT_FACTOR;
+ *h = (s1 - ((s2 + s3) >> 1)) >> 1;
+ s1 = (src[0] - 128) << DWT_FACTOR;
+ *l = s1 + ((*(h - total_width) + *h) >> 1);
}
diff --git a/src/rfxencode_dwt.h b/src/rfxencode_dwt.h
index 248edc1..36a62ed 100644
--- a/src/rfxencode_dwt.h
+++ b/src/rfxencode_dwt.h
@@ -1,7 +1,7 @@
/**
* RFX codec encoder
*
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/src/rfxencode_quantization.c b/src/rfxencode_quantization.c
index 9c65b40..6e3a577 100644
--- a/src/rfxencode_quantization.c
+++ b/src/rfxencode_quantization.c
@@ -3,7 +3,7 @@
* RemoteFX Codec Library - Quantization
*
* Copyright 2011 Vic Lee
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -88,7 +88,7 @@ rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor)
}
#endif
-#if 1
+#if 0
/******************************************************************************/
static int
rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor)
@@ -110,20 +110,54 @@ rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor)
}
#endif
+#if 1
+/******************************************************************************/
+/* Quantize buffer_size coefficients in place by a rounding right shift.
+ *
+ * The shift amount is factor + DWT_FACTOR.  Half of the divisor is added
+ * before shifting so the result is rounded instead of truncated.
+ *
+ * Returns 1 without touching the buffer when the combined shift is 0,
+ * otherwise 0. */
+static int
+rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor)
+{
+    sint16 rounding;
+    int index;
+
+    factor += DWT_FACTOR;
+    if (factor == 0)
+    {
+        return 1;
+    }
+    rounding = (1 << (factor - 1));
+    for (index = 0; index < buffer_size; index++)
+    {
+        buffer[index] = (buffer[index] + rounding) >> factor;
+    }
+    return 0;
+}
+#endif
+
/******************************************************************************/
+/* Quantize the ten sub-bands held in buffer (4096 coefficients) in place.
+ *
+ * qtable packs two 4 bit quantization values per byte, five bytes are
+ * read.  Low nibble then high nibble of each byte, in order:
+ *   qtable[0]: LL3, LH3    qtable[1]: HL3, HH3
+ *   qtable[2]: LH2, HL2    qtable[3]: HH2, LH1
+ *   qtable[4]: HL1, HH1
+ * 6 is subtracted from every value before it is passed on as the shift
+ * factor.  factor is unsigned, so a nibble below 6 wraps around before
+ * rfx_quantization_encode_block adds DWT_FACTOR to it.
+ * The return values of the per block calls are ignored.
+ *
+ * Always returns 0. */
int
-rfx_quantization_encode(sint16* buffer, const int* quantization_values)
+rfx_quantization_encode(sint16* buffer, const char* qtable)
{
- rfx_quantization_encode_block(buffer, 1024, quantization_values[8] - 6); /* HL1 */
- rfx_quantization_encode_block(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
- rfx_quantization_encode_block(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
- rfx_quantization_encode_block(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
- rfx_quantization_encode_block(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
- rfx_quantization_encode_block(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
- rfx_quantization_encode_block(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
- rfx_quantization_encode_block(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
- rfx_quantization_encode_block(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
- rfx_quantization_encode_block(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
+ uint32 factor;
+
+ factor = ((qtable[4] >> 0) & 0xf) - 6;
+ rfx_quantization_encode_block(buffer, 1024, factor); /* HL1 */
+ factor = ((qtable[3] >> 4) & 0xf) - 6;
+ rfx_quantization_encode_block(buffer + 1024, 1024, factor); /* LH1 */
+ factor = ((qtable[4] >> 4) & 0xf) - 6;
+ rfx_quantization_encode_block(buffer + 2048, 1024, factor); /* HH1 */
+ factor = ((qtable[2] >> 4) & 0xf) - 6;
+ rfx_quantization_encode_block(buffer + 3072, 256, factor); /* HL2 */
+ factor = ((qtable[2] >> 0) & 0xf) - 6;
+ rfx_quantization_encode_block(buffer + 3328, 256, factor); /* LH2 */
+ factor = ((qtable[3] >> 0) & 0xf) - 6;
+ rfx_quantization_encode_block(buffer + 3584, 256, factor); /* HH2 */
+ factor = ((qtable[1] >> 0) & 0xf) - 6;
+ rfx_quantization_encode_block(buffer + 3840, 64, factor); /* HL3 */
+ factor = ((qtable[0] >> 4) & 0xf) - 6;
+ rfx_quantization_encode_block(buffer + 3904, 64, factor); /* LH3 */
+ factor = ((qtable[1] >> 4) & 0xf) - 6;
+ rfx_quantization_encode_block(buffer + 3968, 64, factor); /* HH3 */
+ factor = ((qtable[0] >> 0) & 0xf) - 6;
+ rfx_quantization_encode_block(buffer + 4032, 64, factor); /* LL3 */
return 0;
}
diff --git a/src/rfxencode_quantization.h b/src/rfxencode_quantization.h
index a0cd802..d246889 100644
--- a/src/rfxencode_quantization.h
+++ b/src/rfxencode_quantization.h
@@ -23,6 +23,6 @@
#include "rfxcommon.h"
int
-rfx_quantization_encode(sint16* buffer, const int* quantization_values);
+rfx_quantization_encode(sint16 *buffer, const char *quantization_values);
#endif /* __RFX_QUANTIZATION_H */
diff --git a/src/rfxencode_rlgr1.c b/src/rfxencode_rlgr1.c
index e4b9867..638b535 100644
--- a/src/rfxencode_rlgr1.c
+++ b/src/rfxencode_rlgr1.c
@@ -124,7 +124,7 @@ do { \
} while (0)
int
-rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size)
+rfx_rlgr1_encode(const sint16* data, uint8* buffer, int buffer_size)
{
int k;
int kp;
@@ -137,6 +137,7 @@ rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si
int sign;
int processed_size;
int lmag;
+ int data_size;
RFX_BITSTREAM bs;
@@ -150,6 +151,7 @@ rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si
krp = 1 << LSGR;
/* process all the input coefficients */
+ data_size = 4096;
while (data_size > 0)
{
if (k)
diff --git a/src/rfxencode_rlgr1.h b/src/rfxencode_rlgr1.h
index a08e637..f941e06 100644
--- a/src/rfxencode_rlgr1.h
+++ b/src/rfxencode_rlgr1.h
@@ -23,6 +23,6 @@
#include "rfxcommon.h"
int
-rfx_rlgr1_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size);
+rfx_rlgr1_encode(const sint16* data, uint8* buffer, int buffer_size);
#endif /* __RFX_RLGR_H */
diff --git a/src/rfxencode_rlgr3.c b/src/rfxencode_rlgr3.c
index 3b1666d..809767d 100644
--- a/src/rfxencode_rlgr3.c
+++ b/src/rfxencode_rlgr3.c
@@ -124,7 +124,7 @@ do { \
} while (0)
int
-rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size)
+rfx_rlgr3_encode(const sint16* data, uint8* buffer, int buffer_size)
{
int k;
int kp;
@@ -137,6 +137,7 @@ rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si
int sign;
int processed_size;
int lmag;
+ int data_size;
RFX_BITSTREAM bs;
@@ -153,6 +154,7 @@ rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_si
krp = 1 << LSGR;
/* process all the input coefficients */
+ data_size = 4096;
while (data_size > 0)
{
if (k)
diff --git a/src/rfxencode_rlgr3.h b/src/rfxencode_rlgr3.h
index 1efdc4c..2743e39 100644
--- a/src/rfxencode_rlgr3.h
+++ b/src/rfxencode_rlgr3.h
@@ -23,6 +23,6 @@
#include "rfxcommon.h"
int
-rfx_rlgr3_encode(const sint16* data, int data_size, uint8* buffer, int buffer_size);
+rfx_rlgr3_encode(const sint16* data, uint8* buffer, int buffer_size);
#endif /* __RFX_RLGR_H */
diff --git a/src/rfxencode_tile.c b/src/rfxencode_tile.c
index 409121c..e78b746 100644
--- a/src/rfxencode_tile.c
+++ b/src/rfxencode_tile.c
@@ -3,7 +3,7 @@
* RemoteFX Codec Library - Encode
*
* Copyright 2011 Vic Lee
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -33,6 +33,15 @@
#include "rfxencode_differential.h"
#include "rfxencode_rlgr1.h"
#include "rfxencode_rlgr3.h"
+#include "rfxencode_alpha.h"
+
+#ifdef RFX_USE_ACCEL_X86
+#include "x86/funcs_x86.h"
+#endif
+
+#ifdef RFX_USE_ACCEL_AMD64
+#include "amd64/funcs_amd64.h"
+#endif
#define LLOG_LEVEL 1
#define LLOGLN(_level, _args) \
@@ -50,69 +59,337 @@ rfx_encode_format_rgb(char *rgb_data, int width, int height,
uint8 r;
uint8 g;
uint8 b;
+ uint8 *lr_buf;
+ uint8 *lg_buf;
+ uint8 *lb_buf;
+ LLOGLN(10, ("rfx_encode_format_rgb: pixel_format %d", pixel_format));
+ b = 0;
+ g = 0;
+ r = 0;
switch (pixel_format)
{
case RFX_FORMAT_BGRA:
+ /* split 4 byte B,G,R,A pixels into the three 64 byte wide planes,
+ the alpha byte is skipped */
for (y = 0; y < height; y++)
{
src = (uint8*) (rgb_data + y * stride_bytes);
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
for (x = 0; x < width; x++)
{
b = *src++;
- *b_buf++ = b;
+ *lb_buf++ = b;
g = *src++;
- *g_buf++ = g;
+ *lg_buf++ = g;
r = *src++;
- *r_buf++ = r;
+ *lr_buf++ = r;
src++;
}
+ /* pad the row out to 64 columns by repeating the last pixel,
+ each plane gets its own component */
+ while (x < 64)
+ {
+ *lr_buf++ = r;
+ *lg_buf++ = g;
+ *lb_buf++ = b;
+ x++;
+ }
+ }
+ /* pad out to 64 rows by copying the row above, this reads row
+ y - 1 so it relies on height being at least 1 */
+ while (y < 64)
+ {
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ memcpy(lr_buf, lr_buf - 64, 64);
+ memcpy(lg_buf, lg_buf - 64, 64);
+ memcpy(lb_buf, lb_buf - 64, 64);
+ y++;
}
break;
case RFX_FORMAT_RGBA:
for (y = 0; y < height; y++)
{
src = (uint8*) (rgb_data + y * stride_bytes);
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
for (x = 0; x < width; x++)
{
r = *src++;
- *r_buf++ = r;
+ *lr_buf++ = r;
g = *src++;
- *g_buf++ = g;
+ *lg_buf++ = g;
b = *src++;
- *b_buf++ = b;
+ *lb_buf++ = b;
src++;
}
+ while (x < 64)
+ {
+ *lr_buf++ = r;
+ *lg_buf++ = g;
+ *lb_buf++ = b;
+ x++;
+ }
+ }
+ while (y < 64)
+ {
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ memcpy(lr_buf, lr_buf - 64, 64);
+ memcpy(lg_buf, lg_buf - 64, 64);
+ memcpy(lb_buf, lb_buf - 64, 64);
+ y++;
}
break;
case RFX_FORMAT_BGR:
for (y = 0; y < height; y++)
{
src = (uint8*) (rgb_data + y * stride_bytes);
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
for (x = 0; x < width; x++)
{
b = *src++;
- *b_buf++ = b;
+ *lb_buf++ = b;
g = *src++;
- *g_buf++ = g;
+ *lg_buf++ = g;
r = *src++;
- *r_buf++ = r;
+ *lr_buf++ = r;
+ }
+ while (x < 64)
+ {
+ *lr_buf++ = r;
+ *lg_buf++ = g;
+ *lb_buf++ = b;
+ x++;
}
}
+ while (y < 64)
+ {
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ memcpy(lr_buf, lr_buf - 64, 64);
+ memcpy(lg_buf, lg_buf - 64, 64);
+ memcpy(lb_buf, lb_buf - 64, 64);
+ y++;
+ }
break;
case RFX_FORMAT_RGB:
for (y = 0; y < height; y++)
{
src = (uint8*) (rgb_data + y * stride_bytes);
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ for (x = 0; x < width; x++)
+ {
+ r = *src++;
+ *lr_buf++ = r;
+ g = *src++;
+ *lg_buf++ = g;
+ b = *src++;
+ *lb_buf++ = b;
+ }
+ while (x < 64)
+ {
+ *lr_buf++ = r;
+ *lg_buf++ = g;
+ *lb_buf++ = b;
+ x++;
+ }
+ }
+ while (y < 64)
+ {
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ memcpy(lr_buf, lr_buf - 64, 64);
+ memcpy(lg_buf, lg_buf - 64, 64);
+ memcpy(lb_buf, lb_buf - 64, 64);
+ y++;
+ }
+ break;
+ }
+ return 0;
+}
+
+/******************************************************************************/
+static int
+rfx_encode_format_argb(char *argb_data, int width, int height,
+ int stride_bytes, int pixel_format,
+ uint8 *a_buf, uint8 *r_buf, uint8 *g_buf, uint8 *b_buf)
+{
+ int x;
+ int y;
+ const uint8 *src;
+ uint8 a;
+ uint8 r;
+ uint8 g;
+ uint8 b;
+ uint8 *la_buf;
+ uint8 *lr_buf;
+ uint8 *lg_buf;
+ uint8 *lb_buf;
+
+ LLOGLN(10, ("rfx_encode_format_argb: pixel_format %d", pixel_format));
+ b = 0;
+ g = 0;
+ r = 0;
+ a = 0;
+ switch (pixel_format)
+ {
+ case RFX_FORMAT_BGRA:
+ /* split 4 byte B,G,R,A pixels into the four 64 byte wide planes */
+ for (y = 0; y < height; y++)
+ {
+ src = (uint8*) (argb_data + y * stride_bytes);
+ la_buf = a_buf + y * 64;
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ for (x = 0; x < width; x++)
+ {
+ b = *src++;
+ *lb_buf++ = b;
+ g = *src++;
+ *lg_buf++ = g;
+ r = *src++;
+ *lr_buf++ = r;
+ a = *src++;
+ *la_buf++ = a;
+ }
+ /* pad the row out to 64 columns by repeating the last pixel,
+ each plane gets its own component */
+ while (x < 64)
+ {
+ *la_buf++ = a;
+ *lr_buf++ = r;
+ *lg_buf++ = g;
+ *lb_buf++ = b;
+ x++;
+ }
+ }
+ /* pad out to 64 rows by copying the row above, this reads row
+ y - 1 so it relies on height being at least 1 */
+ while (y < 64)
+ {
+ la_buf = a_buf + y * 64;
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ memcpy(la_buf, la_buf - 64, 64);
+ memcpy(lr_buf, lr_buf - 64, 64);
+ memcpy(lg_buf, lg_buf - 64, 64);
+ memcpy(lb_buf, lb_buf - 64, 64);
+ y++;
+ }
+ break;
+ case RFX_FORMAT_RGBA:
+ for (y = 0; y < height; y++)
+ {
+ src = (uint8*) (argb_data + y * stride_bytes);
+ la_buf = a_buf + y * 64;
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ for (x = 0; x < width; x++)
+ {
+ r = *src++;
+ *lr_buf++ = r;
+ g = *src++;
+ *lg_buf++ = g;
+ b = *src++;
+ *lb_buf++ = b;
+ a = *src++;
+ *la_buf++ = a;
+ }
+ while (x < 64)
+ {
+ *la_buf++ = a;
+ *lr_buf++ = r;
+ *lg_buf++ = g;
+ *lb_buf++ = b;
+ x++;
+ }
+ }
+ while (y < 64)
+ {
+ la_buf = a_buf + y * 64;
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ memcpy(la_buf, la_buf - 64, 64);
+ memcpy(lr_buf, lr_buf - 64, 64);
+ memcpy(lg_buf, lg_buf - 64, 64);
+ memcpy(lb_buf, lb_buf - 64, 64);
+ y++;
+ }
+ break;
+ case RFX_FORMAT_BGR:
+ for (y = 0; y < height; y++)
+ {
+ src = (uint8*) (argb_data + y * stride_bytes);
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
for (x = 0; x < width; x++)
{
+ b = *src++;
+ *lb_buf++ = b;
+ g = *src++;
+ *lg_buf++ = g;
r = *src++;
- *r_buf++ = r;
+ *lr_buf++ = r;
+ }
+ while (x < 64)
+ {
+ *lr_buf++ = r;
+ *lg_buf++ = g;
+ *lb_buf++ = b;
+ x++;
+ }
+ }
+ while (y < 64)
+ {
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ memcpy(lr_buf, lr_buf - 64, 64);
+ memcpy(lg_buf, lg_buf - 64, 64);
+ memcpy(lb_buf, lb_buf - 64, 64);
+ y++;
+ }
+ break;
+ case RFX_FORMAT_RGB:
+ for (y = 0; y < height; y++)
+ {
+ src = (uint8*) (argb_data + y * stride_bytes);
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ for (x = 0; x < width; x++)
+ {
+ r = *src++;
+ *lr_buf++ = r;
g = *src++;
- *g_buf++ = g;
+ *lg_buf++ = g;
b = *src++;
- *b_buf++ = b;
+ *lb_buf++ = b;
}
+ while (x < 64)
+ {
+ *lr_buf++ = r;
+ *lg_buf++ = g;
+ *lb_buf++ = b;
+ x++;
+ }
+ }
+ while (y < 64)
+ {
+ lr_buf = r_buf + y * 64;
+ lg_buf = g_buf + y * 64;
+ lb_buf = b_buf + y * 64;
+ memcpy(lr_buf, lr_buf - 64, 64);
+ memcpy(lg_buf, lg_buf - 64, 64);
+ memcpy(lb_buf, lb_buf - 64, 64);
+ y++;
}
break;
}
@@ -131,25 +408,25 @@ rfx_encode_format_rgb(char *rgb_data, int width, int height,
-11071 -21736 32807
32756 -27429 -5327 */
static int
-rfx_encode_rgb_to_ycbcr(uint8 *y_r_buf, uint8 *cb_g_buf, uint8 *cr_b_buf)
+rfx_encode_rgb_to_yuv(uint8 *y_r_buf, uint8 *u_g_buf, uint8 *v_b_buf)
{
int i;
sint32 r, g, b;
- sint32 y, cb, cr;
+ sint32 y, u, v;
for (i = 0; i < 4096; i++)
{
r = y_r_buf[i];
- g = cb_g_buf[i];
- b = cr_b_buf[i];
+ g = u_g_buf[i];
+ b = v_b_buf[i];
- y = (r * 19595 + g * 38470 + b * 7471) >> 16;
- cb = (r * -11071 + g * -21736 + b * 32807) >> 16;
- cr = (r * 32756 + g * -27429 + b * -5327) >> 16;
+ y = (r * 19595 + g * 38470 + b * 7471) >> 16;
+ u = (r * -11071 + g * -21736 + b * 32807) >> 16;
+ v = (r * 32756 + g * -27429 + b * -5327) >> 16;
y_r_buf[i] = MINMAX(y, 0, 255);
- cb_g_buf[i] = MINMAX(cb + 128, 0, 255);
- cr_b_buf[i] = MINMAX(cr + 128, 0, 255);
+ u_g_buf[i] = MINMAX(u + 128, 0, 255);
+ v_b_buf[i] = MINMAX(v + 128, 0, 255);
}
return 0;
@@ -157,14 +434,15 @@ rfx_encode_rgb_to_ycbcr(uint8 *y_r_buf, uint8 *cb_g_buf, uint8 *cr_b_buf)
/******************************************************************************/
int
-rfx_encode_component_rlgr1(struct rfxencode *enc, const int *quantization_values,
+rfx_encode_component_rlgr1(struct rfxencode *enc, const char *qtable,
uint8 *data, uint8 *buffer, int buffer_size, int *size)
{
+ LLOGLN(10, ("rfx_encode_component_rlgr1:"));
if (rfx_dwt_2d_encode(data, enc->dwt_buffer1, enc->dwt_buffer) != 0)
{
return 1;
}
- if (rfx_quantization_encode(enc->dwt_buffer1, quantization_values) != 0)
+ if (rfx_quantization_encode(enc->dwt_buffer1, qtable) != 0)
{
return 1;
}
@@ -172,47 +450,220 @@ rfx_encode_component_rlgr1(struct rfxencode *enc, const int *quantization_values
{
return 1;
}
- *size = rfx_rlgr1_encode(enc->dwt_buffer1, 4096, buffer, buffer_size);
+ *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size);
return 0;
}
/******************************************************************************/
+/* Encode one tile component with the C code path and RLGR3.
+ *
+ * Steps, each aborting with 1 on a non zero result:
+ *   rfx_dwt_2d_encode        data -> enc->dwt_buffer1 (enc->dwt_buffer is
+ *                            passed as a second work buffer)
+ *   rfx_quantization_encode  in place, using qtable
+ *   rfx_differential_encode  on the 64 coefficients at offset 4032, the
+ *                            range rfx_quantization_encode labels LL3
+ * The result of rfx_rlgr3_encode into buffer (at most buffer_size bytes
+ * are offered to it) is stored in *size; it is not checked here.
+ *
+ * Returns 0 on success, 1 if any step failed. */
int
-rfx_encode_component_rlgr3(struct rfxencode *enc, const int *quantization_values,
+rfx_encode_component_rlgr3(struct rfxencode *enc, const char *qtable,
uint8 *data, uint8 *buffer, int buffer_size, int *size)
{
+ LLOGLN(10, ("rfx_encode_component_rlgr3:"));
if (rfx_dwt_2d_encode(data, enc->dwt_buffer1, enc->dwt_buffer) != 0)
{
return 1;
}
- if (rfx_quantization_encode(enc->dwt_buffer1, quantization_values) != 0)
+ if (rfx_quantization_encode(enc->dwt_buffer1, qtable) != 0)
+ {
+ return 1;
+ }
+ if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
+ {
+ return 1;
+ }
+ *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size);
+ return 0;
+}
+
+/******************************************************************************/
+/* Encode one tile component using the x86 SSE2 DWT/shift routine, then
+ * the C differential and RLGR1 encoders.  The combined asm diff + RLGR1
+ * routine is not called from here.
+ *
+ * When RFX_USE_ACCEL_X86 is not defined nothing is encoded: *size is left
+ * untouched and 0 is returned.
+ *
+ * Returns 0 on success, 1 if the DWT or the differential step failed. */
+int
+rfx_encode_component_rlgr1_x86_sse2(struct rfxencode *enc, const char *qtable,
+                                    uint8 *data,
+                                    uint8 *buffer, int buffer_size, int *size)
+{
+    int rv;
+
+    LLOGLN(10, ("rfx_encode_component_rlgr1_x86_sse2:"));
+    rv = 0;
+#ifdef RFX_USE_ACCEL_X86
+    if ((rfxcodec_encode_dwt_shift_x86_sse2(qtable, data, enc->dwt_buffer1,
+                                            enc->dwt_buffer) != 0) ||
+        (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0))
+    {
+        rv = 1;
+    }
+    else
+    {
+        *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size);
+    }
+#endif
+    return rv;
+}
+
+/******************************************************************************/
+/* Encode one tile component using the x86 SSE2 DWT/shift routine, then
+ * the C differential and RLGR3 encoders.  The combined asm diff + RLGR3
+ * routine is not called from here.
+ *
+ * When RFX_USE_ACCEL_X86 is not defined nothing is encoded: *size is left
+ * untouched and 0 is returned.
+ *
+ * Returns 0 on success, 1 if the DWT or the differential step failed. */
+int
+rfx_encode_component_rlgr3_x86_sse2(struct rfxencode *enc, const char *qtable,
+                                    uint8 *data,
+                                    uint8 *buffer, int buffer_size, int *size)
+{
+    int rv;
+
+    LLOGLN(10, ("rfx_encode_component_rlgr3_x86_sse2:"));
+    rv = 0;
+#ifdef RFX_USE_ACCEL_X86
+    if ((rfxcodec_encode_dwt_shift_x86_sse2(qtable, data, enc->dwt_buffer1,
+                                            enc->dwt_buffer) != 0) ||
+        (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0))
+    {
+        rv = 1;
+    }
+    else
+    {
+        *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size);
+    }
+#endif
+    return rv;
+}
+
+/******************************************************************************/
+/* Encode one tile component using the x86 SSE4.1 DWT/shift routine, then
+ * the C differential and RLGR1 encoders.  The combined asm diff + RLGR1
+ * routine is not called from here.
+ *
+ * When RFX_USE_ACCEL_X86 is not defined nothing is encoded: *size is left
+ * untouched and 0 is returned.
+ *
+ * Returns 0 on success, 1 if the DWT or the differential step failed. */
+int
+rfx_encode_component_rlgr1_x86_sse41(struct rfxencode *enc, const char *qtable,
+                                     uint8 *data,
+                                     uint8 *buffer, int buffer_size, int *size)
+{
+    int rv;
+
+    LLOGLN(10, ("rfx_encode_component_rlgr1_x86_sse41:"));
+    rv = 0;
+#ifdef RFX_USE_ACCEL_X86
+    if ((rfxcodec_encode_dwt_shift_x86_sse41(qtable, data, enc->dwt_buffer1,
+                                             enc->dwt_buffer) != 0) ||
+        (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0))
+    {
+        rv = 1;
+    }
+    else
+    {
+        *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size);
+    }
+#endif
+    return rv;
+}
+
+/******************************************************************************/
+/* Encode one tile component using the x86 SSE4.1 DWT/shift routine, then
+ * the C differential and RLGR3 encoders.  The call to the combined asm
+ * diff + RLGR3 routine is commented out below.
+ *
+ * When RFX_USE_ACCEL_X86 is not defined nothing is encoded: *size is left
+ * untouched and 0 is returned.
+ *
+ * Returns 0 on success, 1 if the DWT or the differential step failed. */
+int
+rfx_encode_component_rlgr3_x86_sse41(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size)
+{
+ LLOGLN(10, ("rfx_encode_component_rlgr3_x86_sse41:"));
+#if defined(RFX_USE_ACCEL_X86)
+ if (rfxcodec_encode_dwt_shift_x86_sse41(qtable, data, enc->dwt_buffer1,
+ enc->dwt_buffer) != 0)
{
return 1;
}
+ //*size = rfxcodec_encode_diff_rlgr3_x86_sse(enc->dwt_buffer1,
+ // buffer, buffer_size);
if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
{
return 1;
}
- *size = rfx_rlgr3_encode(enc->dwt_buffer1, 4096, buffer, buffer_size);
+ *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size);
+#endif
return 0;
}
/******************************************************************************/
+/* Encode one tile component using the amd64 SSE2 DWT/shift routine, then
+ * the C differential and RLGR1 encoders.  The call to the combined asm
+ * diff + RLGR1 routine is commented out below.
+ *
+ * When RFX_USE_ACCEL_AMD64 is not defined nothing is encoded: *size is
+ * left untouched and 0 is returned.
+ *
+ * Returns 0 on success, 1 if the DWT or the differential step failed. */
int
-rfx_encode_component_x86_sse2(struct rfxencode *enc,
- const int *quantization_values,
- uint8 *data,
- uint8 *buffer, int buffer_size, int *size)
+rfx_encode_component_rlgr1_amd64_sse2(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size)
{
- LLOGLN(10, ("rfx_encode_component_x86_sse2:"));
-#if defined(RFX_USE_ACCEL) && RFX_USE_ACCEL
- /* put asm calls here */
- if (dwt_shift_x86_sse2(quantization_values, data, enc->dwt_buffer1,
- enc->dwt_buffer) != 0)
+ LLOGLN(10, ("rfx_encode_component_rlgr1_amd64_sse2:"));
+#if defined(RFX_USE_ACCEL_AMD64)
+ if (rfxcodec_encode_dwt_shift_amd64_sse2(qtable, data, enc->dwt_buffer1,
+ enc->dwt_buffer) != 0)
+ {
+ return 1;
+ }
+ //*size = rfxcodec_encode_diff_rlgr1_amd64_sse2(enc->dwt_buffer1,
+ // buffer, buffer_size);
+ if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
{
return 1;
}
- *size = diff_rlgr3_x86(enc->dwt_buffer1, 4096, buffer, buffer_size);
+ *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size);
+#endif
+ return 0;
+}
+
+/******************************************************************************/
+/* Encode one tile component using the amd64 SSE2 DWT/shift routine, then
+ * the C differential and RLGR3 encoders.  The combined asm diff + RLGR3
+ * routine is not called from here.
+ *
+ * When RFX_USE_ACCEL_AMD64 is not defined nothing is encoded: *size is
+ * left untouched and 0 is returned.
+ *
+ * Returns 0 on success, 1 if the DWT or the differential step failed. */
+int
+rfx_encode_component_rlgr3_amd64_sse2(struct rfxencode *enc, const char *qtable,
+                                      uint8 *data,
+                                      uint8 *buffer, int buffer_size, int *size)
+{
+    int rv;
+
+    LLOGLN(10, ("rfx_encode_component_rlgr3_amd64_sse2:"));
+    rv = 0;
+#ifdef RFX_USE_ACCEL_AMD64
+    if ((rfxcodec_encode_dwt_shift_amd64_sse2(qtable, data, enc->dwt_buffer1,
+                                              enc->dwt_buffer) != 0) ||
+        (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0))
+    {
+        rv = 1;
+    }
+    else
+    {
+        *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size);
+    }
+#endif
+    return rv;
+}
+
+/******************************************************************************/
+/* Encode one tile component using the amd64 SSE4.1 DWT/shift routine,
+ * then the C differential and RLGR1 encoders.  The call to the combined
+ * asm diff + RLGR1 routine is commented out below.
+ *
+ * When RFX_USE_ACCEL_AMD64 is not defined nothing is encoded: *size is
+ * left untouched and 0 is returned.
+ *
+ * Returns 0 on success, 1 if the DWT or the differential step failed. */
+int
+rfx_encode_component_rlgr1_amd64_sse41(struct rfxencode *enc, const char *qtable,
+                                       uint8 *data,
+                                       uint8 *buffer, int buffer_size, int *size)
+{
+    /* log under this function's own name, not the sse2 variant's */
+    LLOGLN(10, ("rfx_encode_component_rlgr1_amd64_sse41:"));
+#if defined(RFX_USE_ACCEL_AMD64)
+    if (rfxcodec_encode_dwt_shift_amd64_sse41(qtable, data, enc->dwt_buffer1,
+                                              enc->dwt_buffer) != 0)
+    {
+        return 1;
+    }
+    //*size = rfxcodec_encode_diff_rlgr1_amd64_sse2(enc->dwt_buffer1,
+    //                                              buffer, buffer_size);
+    if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
+    {
+        return 1;
+    }
+    *size = rfx_rlgr1_encode(enc->dwt_buffer1, buffer, buffer_size);
+#endif
+    return 0;
+}
+
+/******************************************************************************/
+/* Encode one tile component using the amd64 SSE4.1 DWT/shift routine,
+ * then the C differential and RLGR3 encoders.  The call to the combined
+ * asm diff + RLGR3 routine is commented out below.
+ *
+ * When RFX_USE_ACCEL_AMD64 is not defined nothing is encoded: *size is
+ * left untouched and 0 is returned.
+ *
+ * Returns 0 on success, 1 if the DWT or the differential step failed. */
+int
+rfx_encode_component_rlgr3_amd64_sse41(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size)
+{
+ /* log under this function's own name, not the sse2 variant's */
+ LLOGLN(10, ("rfx_encode_component_rlgr3_amd64_sse41:"));
+#if defined(RFX_USE_ACCEL_AMD64)
+ if (rfxcodec_encode_dwt_shift_amd64_sse41(qtable, data, enc->dwt_buffer1,
+ enc->dwt_buffer) != 0)
+ {
+ return 1;
+ }
+ //*size = rfxcodec_encode_diff_rlgr3_amd64_sse2(enc->dwt_buffer1,
+ // buffer, buffer_size);
+ if (rfx_differential_encode(enc->dwt_buffer1 + 4032, 64) != 0)
+ {
+ return 1;
+ }
+ *size = rfx_rlgr3_encode(enc->dwt_buffer1, buffer, buffer_size);
#endif
return 0;
}
@@ -221,23 +672,84 @@ rfx_encode_component_x86_sse2(struct rfxencode *enc,
+/* Encode one RGB tile as three components appended to data_out.
+ *
+ * rgb_data is split into the encoder's three plane buffers by
+ * rfx_encode_format_rgb (using enc->format), converted in place by
+ * rfx_encode_rgb_to_yuv, and each plane is then passed to the encoder
+ * selected in enc->rfx_encode together with its quantization table.
+ * After every component data_out is advanced by the size that component
+ * reported, so the three outputs end up back to back.
+ *
+ * y_size, u_size, v_size receive the per component sizes.
+ *
+ * Returns 0 on success, 1 as soon as any step reports failure; data_out
+ * is not rewound in that case. */
int
rfx_encode_rgb(struct rfxencode *enc, char *rgb_data,
int width, int height, int stride_bytes,
- const int *y_quants, const int *cb_quants, const int *cr_quants,
- STREAM *data_out, int *y_size, int *cb_size, int *cr_size)
+ const char *y_quants, const char *u_quants,
+ const char *v_quants,
+ STREAM *data_out, int *y_size, int *u_size, int *v_size)
{
uint8 *y_r_buffer;
- uint8 *cb_g_buffer;
- uint8 *cr_b_buffer;
+ uint8 *u_g_buffer;
+ uint8 *v_b_buffer;
y_r_buffer = enc->y_r_buffer;
- cb_g_buffer = enc->cb_g_buffer;
- cr_b_buffer = enc->cr_b_buffer;
+ u_g_buffer = enc->u_g_buffer;
+ v_b_buffer = enc->v_b_buffer;
if (rfx_encode_format_rgb(rgb_data, width, height, stride_bytes,
enc->format,
- y_r_buffer, cb_g_buffer, cr_b_buffer) != 0)
+ y_r_buffer, u_g_buffer, v_b_buffer) != 0)
+ {
+ return 1;
+ }
+ if (rfx_encode_rgb_to_yuv(y_r_buffer, u_g_buffer, v_b_buffer) != 0)
+ {
+ return 1;
+ }
+ if (enc->rfx_encode(enc, y_quants, y_r_buffer,
+ stream_get_tail(data_out),
+ stream_get_left(data_out),
+ y_size) != 0)
+ {
+ return 1;
+ }
+ LLOGLN(10, ("rfx_encode_rgb: y_size %d", *y_size));
+ stream_seek(data_out, *y_size);
+ if (enc->rfx_encode(enc, u_quants, u_g_buffer,
+ stream_get_tail(data_out),
+ stream_get_left(data_out),
+ u_size) != 0)
{
return 1;
}
- if (rfx_encode_rgb_to_ycbcr(y_r_buffer, cb_g_buffer, cr_b_buffer) != 0)
+ LLOGLN(10, ("rfx_encode_rgb: u_size %d", *u_size));
+ stream_seek(data_out, *u_size);
+ if (enc->rfx_encode(enc, v_quants, v_b_buffer,
+ stream_get_tail(data_out),
+ stream_get_left(data_out),
+ v_size) != 0)
+ {
+ return 1;
+ }
+ LLOGLN(10, ("rfx_encode_rgb: v_size %d", *v_size));
+ stream_seek(data_out, *v_size);
+ return 0;
+}
+
+/******************************************************************************/
+int
+rfx_encode_argb(struct rfxencode *enc, char *rgb_data,
+ int width, int height, int stride_bytes,
+ const char *y_quants, const char *u_quants,
+ const char *v_quants,
+ STREAM *data_out, int *y_size, int *u_size,
+ int *v_size, int *a_size)
+{
+ uint8 *a_buffer;
+ uint8 *y_r_buffer;
+ uint8 *u_g_buffer;
+ uint8 *v_b_buffer;
+
+ LLOGLN(10, ("rfx_encode_argb:"));
+ a_buffer = enc->a_buffer;
+ y_r_buffer = enc->y_r_buffer;
+ u_g_buffer = enc->u_g_buffer;
+ v_b_buffer = enc->v_b_buffer;
+ if (rfx_encode_format_argb(rgb_data, width, height, stride_bytes,
+ enc->format,
+ a_buffer, y_r_buffer,
+ u_g_buffer, v_b_buffer) != 0)
+ {
+ return 1;
+ }
+ if (rfx_encode_rgb_to_yuv(y_r_buffer, u_g_buffer, v_b_buffer) != 0)
{
return 1;
}
@@ -250,24 +762,25 @@ rfx_encode_rgb(struct rfxencode *enc, char *rgb_data,
}
LLOGLN(10, ("rfx_encode_rgb: y_size %d", *y_size));
stream_seek(data_out, *y_size);
- if (enc->rfx_encode(enc, cb_quants, cb_g_buffer,
+ if (enc->rfx_encode(enc, u_quants, u_g_buffer,
stream_get_tail(data_out),
stream_get_left(data_out),
- cb_size) != 0)
+ u_size) != 0)
{
return 1;
}
- LLOGLN(10, ("rfx_encode_rgb: cb_size %d", *cb_size));
- stream_seek(data_out, *cb_size);
- if (enc->rfx_encode(enc, cr_quants, cr_b_buffer,
+ LLOGLN(10, ("rfx_encode_rgb: u_size %d", *u_size));
+ stream_seek(data_out, *u_size);
+ if (enc->rfx_encode(enc, v_quants, v_b_buffer,
stream_get_tail(data_out),
stream_get_left(data_out),
- cr_size) != 0)
+ v_size) != 0)
{
return 1;
}
- LLOGLN(10, ("rfx_encode_rgb: cr_size %d", *cr_size));
- stream_seek(data_out, *cr_size);
+ LLOGLN(10, ("rfx_encode_rgb: v_size %d", *v_size));
+ stream_seek(data_out, *v_size);
+ *a_size = rfx_encode_plane(enc, a_buffer, 64, 64, data_out);
return 0;
}
@@ -275,7 +788,8 @@ rfx_encode_rgb(struct rfxencode *enc, char *rgb_data,
int
rfx_encode_yuv(struct rfxencode *enc, char *yuv_data,
int width, int height, int stride_bytes,
- const int *y_quants, const int *u_quants, const int *v_quants,
+ const char *y_quants, const char *u_quants,
+ const char *v_quants,
STREAM *data_out, int *y_size, int *u_size, int *v_size)
{
uint8 *y_buffer;
@@ -311,3 +825,50 @@ rfx_encode_yuv(struct rfxencode *enc, char *yuv_data,
stream_seek(data_out, *v_size);
return 0;
}
+
+/******************************************************************************/
+/* Encode a tile that is already split into four planes of RFX_YUV_BTES
+ * bytes each, laid out back to back in yuva_data as Y, U, V, A.
+ *
+ * The first three planes go through the encoder selected in
+ * enc->rfx_encode with their quantization tables, and data_out is
+ * advanced by each reported size.  The fourth plane is written with
+ * rfx_encode_plane as a 64 by 64 plane.  width, height and stride_bytes
+ * are not used here.
+ *
+ * y_size, u_size, v_size and a_size receive the per plane sizes.
+ *
+ * Returns 0 on success, 1 as soon as a component encoder fails. */
+int
+rfx_encode_yuva(struct rfxencode *enc, char *yuva_data,
+                int width, int height, int stride_bytes,
+                const char *y_quants, const char *u_quants,
+                const char *v_quants,
+                STREAM *data_out, int *y_size, int *u_size,
+                int *v_size, int *a_size)
+{
+    const char *quants[3];
+    int *sizes[3];
+    uint8 *plane;
+    int index;
+
+    quants[0] = y_quants;
+    quants[1] = u_quants;
+    quants[2] = v_quants;
+    sizes[0] = y_size;
+    sizes[1] = u_size;
+    sizes[2] = v_size;
+    for (index = 0; index < 3; index++)
+    {
+        plane = (uint8 *) (yuva_data + RFX_YUV_BTES * index);
+        if (enc->rfx_encode(enc, quants[index], plane,
+                            stream_get_tail(data_out),
+                            stream_get_left(data_out),
+                            sizes[index]) != 0)
+        {
+            return 1;
+        }
+        stream_seek(data_out, *sizes[index]);
+    }
+    /* alpha is the fourth plane and is not DWT coded */
+    plane = (uint8 *) (yuva_data + RFX_YUV_BTES * 3);
+    *a_size = rfx_encode_plane(enc, plane, 64, 64, data_out);
+    return 0;
+}
+
diff --git a/src/rfxencode_tile.h b/src/rfxencode_tile.h
index 01604c6..6195d8d 100644
--- a/src/rfxencode_tile.h
+++ b/src/rfxencode_tile.h
@@ -3,7 +3,7 @@
* RemoteFX Codec Library - Encode
*
* Copyright 2011 Vic Lee
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -27,34 +27,71 @@
#define RFX_YUV_BTES (64 * 64)
int
-rfx_encode_component_rlgr1(struct rfxencode *enc,
- const int *quantization_values,
+rfx_encode_component_rlgr1(struct rfxencode *enc, const char *qtable,
uint8 *data,
uint8 *buffer, int buffer_size, int *size);
int
-rfx_encode_component_rlgr3(struct rfxencode *enc,
- const int *quantization_values,
+rfx_encode_component_rlgr3(struct rfxencode *enc, const char *qtable,
uint8 *data,
uint8 *buffer, int buffer_size, int *size);
int
-rfx_encode_component_x86_sse2(struct rfxencode *enc,
- const int *quantization_values,
- uint8 *data,
- uint8 *buffer, int buffer_size, int *size);
-int
-rfx_encode_component_amd64_sse2(struct rfxencode *enc,
- const int *quantization_values,
- uint8 *data,
- uint8 *buffer, int buffer_size, int *size);
-int
rfx_encode_rgb(struct rfxencode *enc, char *rgb_data,
int width, int height, int stride_bytes,
- const int *y_quants, const int *cb_quants, const int *cr_quants,
+ const char *y_quants, const char *u_quants,
+ const char *v_quants,
STREAM *data_out, int *y_size, int *cb_size, int *cr_size);
int
+rfx_encode_argb(struct rfxencode *enc, char *argb_data,
+ int width, int height, int stride_bytes,
+ const char *y_quants, const char *cb_quants,
+ const char *cr_quants,
+ STREAM *data_out, int *y_size, int *u_size,
+ int *v_size, int *a_size);
+int
rfx_encode_yuv(struct rfxencode *enc, char *yuv_data,
int width, int height, int stride_bytes,
- const int *y_quants, const int *u_quants, const int *v_quants,
+ const char *y_quants, const char *u_quants,
+ const char *v_quants,
STREAM *data_out, int *y_size, int *u_size, int *v_size);
+int
+rfx_encode_yuva(struct rfxencode *enc, char *yuv_data,
+ int width, int height, int stride_bytes,
+ const char *y_quants, const char *u_quants,
+ const char *v_quants,
+ STREAM *data_out, int *y_size, int *u_size,
+ int *v_size, int *a_size);
+
+int
+rfx_encode_component_rlgr1_x86_sse2(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr3_x86_sse2(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr1_x86_sse41(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr3_x86_sse41(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr1_amd64_sse2(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr3_amd64_sse2(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr1_amd64_sse41(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size);
+int
+rfx_encode_component_rlgr3_amd64_sse41(struct rfxencode *enc, const char *qtable,
+ uint8 *data,
+ uint8 *buffer, int buffer_size, int *size);
#endif
diff --git a/src/x86/cpuid_x86.asm b/src/x86/cpuid_x86.asm
index 6f9e8c2..fe19a90 100644
--- a/src/x86/cpuid_x86.asm
+++ b/src/x86/cpuid_x86.asm
@@ -1,3 +1,6 @@
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
SECTION .text
@@ -10,7 +13,11 @@ SECTION .text
;int
;cpuid_x86(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx)
+%ifidn __OUTPUT_FORMAT__,elf
PROC cpuid_x86
+%else
+PROC _cpuid_x86
+%endif
; save registers
push ebx
push ecx
diff --git a/src/x86/funcs_x86.h b/src/x86/funcs_x86.h
index 6025d0a..858bc5c 100644
--- a/src/x86/funcs_x86.h
+++ b/src/x86/funcs_x86.h
@@ -1,5 +1,5 @@
/*
-Copyright 2014 Jay Sorg
+Copyright 2014-2015 Jay Sorg
Permission to use, copy, modify, distribute, and sell this software and its
documentation for any purpose is hereby granted without fee, provided that
@@ -24,12 +24,49 @@ x86 asm files
#ifndef __FUNCS_X86_H
#define __FUNCS_X86_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
int
cpuid_x86(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx);
+
int
-dwt_shift_x86_sse2(unsigned char *yuvs, int width, int height, int *rgbs);
+rfxcodec_encode_dwt_shift_x86_sse2(const char *qtable,
+ unsigned char *data,
+ short *dwt_buffer1,
+ short *dwt_buffer);
int
-diff_rlgr3_x86(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
+rfxcodec_encode_dwt_shift_x86_sse41(const char *qtable,
+ unsigned char *data,
+ short *dwt_buffer1,
+ short *dwt_buffer);
+int
+rfxcodec_encode_diff_rlgr1_x86_sse2(short *co,
+ void *dst, int dst_bytes);
+int
+rfxcodec_encode_diff_rlgr3_x86_sse2(short *co,
+ void *dst, int dst_bytes);
+
+int
+rfxcodec_decode_rlgr1_diff_x86_sse2(void *data, int data_bytes,
+ short *out_data);
+int
+rfxcodec_decode_rlgr3_diff_x86_sse2(void *data, int data_bytes,
+ short *out_data);
+int
+rfxcodec_decode_shift_idwt_x86_sse2(const char *qtable, short *src, short *dst);
+int
+rfxcodec_decode_yuv2rgb_x86_sse2(short *ydata, short *udata, short *vdata,
+ unsigned int *rgbdata, int stride);
+int
+rfxcodec_decode_yuva2argb_x86_sse2(short *ydata, short *udata,
+ short *vdata, char *adata,
+ unsigned int *rgbdata, int stride);
+
+#ifdef __cplusplus
+}
#endif
+#endif
diff --git a/src/x86/readme.txt b/src/x86/readme.txt
deleted file mode 100644
index e69de29..0000000
--- a/src/x86/readme.txt
+++ /dev/null
diff --git a/src/x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm b/src/x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm
new file mode 100644
index 0000000..13d10e9
--- /dev/null
+++ b/src/x86/rfxcodec_encode_diff_rlgr1_x86_sse2.asm
@@ -0,0 +1,35 @@
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+ const1 times 8 dw 1 ; eight words of 1; not referenced anywhere in this file
+
+section .text
+
+%macro PROC 1
+ align 16 ; start the procedure on a 16 byte boundary
+ global %1 ; export the symbol given as the macro argument
+ %1:
+%endmacro
+
+;int
+;rfxcodec_encode_diff_rlgr1_x86_sse2(short *co,
+; void *dst, int dst_bytes);
+
+%ifidn __OUTPUT_FORMAT__,elf
+PROC rfxcodec_encode_diff_rlgr1_x86_sse2
+%else
+PROC _rfxcodec_encode_diff_rlgr1_x86_sse2 ; non-ELF output formats get a leading underscore on the symbol
+%endif
+ push ebx ; save ebx, esi, edi; restored in reverse order below
+ push esi
+ push edi
+
+ mov eax, 0 ; stub body: the arguments are never read and nothing is written to dst; always returns 0
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ align 16
+
diff --git a/src/x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm b/src/x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm
new file mode 100644
index 0000000..a8588f2
--- /dev/null
+++ b/src/x86/rfxcodec_encode_diff_rlgr3_x86_sse2.asm
@@ -0,0 +1,35 @@
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+ const1 times 8 dw 1 ; eight words of 1; not referenced anywhere in this file
+
+section .text
+
+%macro PROC 1
+ align 16 ; start the procedure on a 16 byte boundary
+ global %1 ; export the symbol given as the macro argument
+ %1:
+%endmacro
+
+;int
+;rfxcodec_encode_diff_rlgr3_x86_sse2(short *co,
+; void *dst, int dst_bytes);
+
+%ifidn __OUTPUT_FORMAT__,elf
+PROC rfxcodec_encode_diff_rlgr3_x86_sse2
+%else
+PROC _rfxcodec_encode_diff_rlgr3_x86_sse2 ; non-ELF output formats get a leading underscore on the symbol
+%endif
+ push ebx ; save ebx, esi, edi; restored in reverse order below
+ push esi
+ push edi
+
+ mov eax, 0 ; stub body: the arguments are never read and nothing is written to dst; always returns 0
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ align 16
+
diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
new file mode 100644
index 0000000..f6b71b2
--- /dev/null
+++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
@@ -0,0 +1,1533 @@
+;
+;Copyright 2016 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;x86 asm dwt
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+ align 16
+ cw128 times 8 dw 128
+ cdFFFF times 4 dd 65535
+ ; rounding constants indexed by factor (0 to 15): 0 for factor 0, otherwise 1 << (factor - 1)
+ cwa0 times 8 dw 0 ; 0
+ cwa1 times 8 dw 1 ; 1
+ cwa2 times 8 dw 2 ; 2
+ cwa4 times 8 dw 4 ; 3
+ cwa8 times 8 dw 8 ; 4
+ cwa16 times 8 dw 16 ; 5
+ cwa32 times 8 dw 32 ; 6
+ cwa64 times 8 dw 64 ; 7
+ cwa128 times 8 dw 128 ; 8
+ cwa256 times 8 dw 256 ; 9
+ cwa512 times 8 dw 512 ; 10
+ cwa1024 times 8 dw 1024 ; 11
+ cwa2048 times 8 dw 2048 ; 12
+ cwa4096 times 8 dw 4096 ; 13
+ cwa8192 times 8 dw 8192 ; 14
+ cwa16384 times 8 dw 16384 ; 15
+
+section .text
+
+%macro PROC 1
+ align 16 ; start the procedure on a 16 byte boundary
+ global %1 ; export the symbol given as the macro argument
+ %1:
+%endmacro
+
+%define LHI_ADD [esp + 1 * 16 + 4] ; word vector added to the hi output before the shift; NOTE(review): the + 4 presumably skips the return address of the call into the block routines - confirm against the caller
+%define LHI_SFT [esp + 2 * 16 + 4] ; memory operand supplying the psraw shift count for the hi output
+%define LLO_ADD [esp + 3 * 16 + 4] ; word vector added to the lo output before the shift
+%define LLO_SFT [esp + 4 * 16 + 4] ; memory operand supplying the psraw shift count for the lo output
+
+;******************************************************************************
+; horizontal DWT pass with add/shift on both outputs; source 16 bit signed, 16 pixel width, 8 rows
+rfx_dwt_2d_encode_block_horiz_16_16: ; in: esi = source, edi = hi output, edx = lo output; uses eax, ecx, xmm1-xmm7
+ mov ecx, 8 ; row counter
+loop1a:
+ ; pre / post (one 16 pixel row is a single group, so both row edges are handled here)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1 ; keep raw words 0-7
+ movdqa xmm7, xmm2 ; keep raw words 8-15
+ pand xmm1, [cdFFFF] ; keep the low (even) word of every dword
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16 ; sign extend the kept word to 32 bits
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = eight even samples src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; shift down one word so the odd samples sit in the low words
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = eight odd samples src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4 ; shift down two words
+ psrldq xmm4, 4
+ movd eax, xmm7 ; words 8-9 fill the vacated top dword of the first half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7 ; right edge: keep words 14-15 in the top dword, so word 14 is reused as src[2n + 2] for the last n
+ psrldq xmm5, 12
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = eight samples src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD ; add the hi addend, then arithmetic shift right by the hi shift count
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; shift up one word: lane n now holds h[n - 1]
+ and eax, 0xFFFF ; left edge: h[0] is reused as h[-1]
+ movd xmm6, eax
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD ; add the lo addend, then arithmetic shift right by the lo shift count
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; move left
+ lea esi, [esi - 16 * 2]
+ lea edi, [edi - 8 * 2]
+ lea edx, [edx - 8 * 2]
+
+ ; move down (net effect of the three moves: source advances 32 bytes, each output 16 bytes)
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ dec ecx
+ jnz loop1a
+
+ ret
+
+;******************************************************************************
+; vertical DWT pass, outputs stored without add/shift; source 16 bit signed, 16 pixel width (32 byte rows)
+rfx_dwt_2d_encode_block_verti_16_16: ; in: esi = source, edi = hi output, edx = lo output; uses ecx, xmm1-xmm7
+ mov ecx, 2 ; two strips of 8 columns (16 bytes) each
+loop1b:
+ ; pre (first row pair of the strip)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16 * 2] ; src[2n + 1]
+ movdqa xmm3, [esi + 16 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm1 ; no previous h on the first row: h[n] itself is added (same as taking h[n - 1] = h[n])
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi (xmm7 carries h[n - 1] into the next step)
+ ; move down
+ lea esi, [esi + 16 * 2 * 2] ; 2 rows
+ lea edi, [edi + 16 * 2] ; 1 row
+ lea edx, [edx + 16 * 2] ; 1 row
+
+ ; loop (outer counter is parked in the upper 16 bits of ecx while cx counts the inner loop)
+ shl ecx, 16
+ mov cx, 6
+loop2b:
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [esi + 16 * 2] ; src[2n + 1]
+ movdqa xmm3, [esi + 16 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea esi, [esi + 16 * 2 * 2] ; 2 rows
+ lea edi, [edi + 16 * 2] ; 1 row
+ lea edx, [edx + 16 * 2] ; 1 row
+
+ dec cx
+ jnz loop2b
+ shr ecx, 16 ; restore the outer counter
+
+ ; post (last row pair of the strip)
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [esi + 16 * 2] ; src[2n + 1]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; bottom edge: no row below is loaded, src[2n] is reused as src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ ; move down
+ lea esi, [esi + 16 * 2 * 2] ; 2 rows
+ lea edi, [edi + 16 * 2] ; 1 row
+ lea edx, [edx + 16 * 2] ; 1 row
+
+ ; move up (back to the top: 16 source rows, 8 output rows)
+ lea esi, [esi - 16 * 16 * 2]
+ lea edi, [edi - 8 * 16 * 2]
+ lea edx, [edx - 8 * 16 * 2]
+
+ ; move right (next strip of 8 columns)
+ lea esi, [esi + 16]
+ lea edi, [edi + 16]
+ lea edx, [edx + 16]
+
+ dec ecx
+ jnz loop1b
+
+ ret
+
+;******************************************************************************
+; horizontal DWT pass with add/shift on both outputs; source 16 bit signed, 32 pixel width, 16 rows
+rfx_dwt_2d_encode_block_horiz_16_32: ; in: esi = source, edi = hi output, edx = lo output; uses eax, ebx, ecx, xmm1-xmm7
+ mov ecx, 16 ; row counter
+loop1c:
+ ; pre (first 16 pixels of the row)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1 ; keep raw words 0-7
+ movdqa xmm7, xmm2 ; keep raw words 8-15
+ pand xmm1, [cdFFFF] ; keep the low (even) word of every dword
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16 ; sign extend the kept word to 32 bits
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = eight even samples src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; shift down one word so the odd samples sit in the low words
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = eight odd samples src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4 ; shift down two words
+ psrldq xmm4, 4
+ movd eax, xmm7 ; words 8-9 fill the vacated top dword of the first half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [esi + 32] ; first dword of the next 16 pixel group supplies src[2n + 2] for the last n
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = eight samples src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD ; add the hi addend, then arithmetic shift right by the hi shift count
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; shift up one word: lane n now holds h[n - 1]
+ and eax, 0xFFFF ; left edge: h[0] is reused as h[-1]
+ movd xmm6, eax
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; isolate the last h of this group in the low word
+ movd ebx, xmm2 ; save hi (ebx = h[n - 1] for the first lane of the next group)
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD ; add the lo addend, then arithmetic shift right by the lo shift count
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; post (last 16 pixels of the row)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7 ; right edge: keep words 14-15 in the top dword, so word 14 is reused as src[2n + 2] for the last n
+ psrldq xmm5, 12
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for the first lane comes from the previous group
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; move left
+ lea esi, [esi - 32 * 2]
+ lea edi, [edi - 16 * 2]
+ lea edx, [edx - 16 * 2]
+
+ ; move down (net effect per row: source advances 64 bytes, each output 32 bytes)
+ lea esi, [esi + 32 * 2]
+ lea edi, [edi + 16 * 2]
+ lea edx, [edx + 16 * 2]
+
+ dec ecx
+ jnz loop1c
+
+ ret
+
+;******************************************************************************
+; horizontal DWT pass, add/shift on the hi output only (lo is stored as computed); source 16 bit signed, 32 pixel width, 16 rows
+rfx_dwt_2d_encode_block_horiz_16_32_no_lo: ; in: esi = source, edi = hi output, edx = lo output; uses eax, ebx, ecx, xmm1-xmm7
+ mov ecx, 16 ; row counter
+loop1c1:
+ ; pre (first 16 pixels of the row)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1 ; keep raw words 0-7
+ movdqa xmm7, xmm2 ; keep raw words 8-15
+ pand xmm1, [cdFFFF] ; keep the low (even) word of every dword
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16 ; sign extend the kept word to 32 bits
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = eight even samples src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; shift down one word so the odd samples sit in the low words
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = eight odd samples src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4 ; shift down two words
+ psrldq xmm4, 4
+ movd eax, xmm7 ; words 8-9 fill the vacated top dword of the first half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [esi + 32] ; first dword of the next 16 pixel group supplies src[2n + 2] for the last n
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = eight samples src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD ; add the hi addend, then arithmetic shift right by the hi shift count
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; shift up one word: lane n now holds h[n - 1]
+ and eax, 0xFFFF ; left edge: h[0] is reused as h[-1]
+ movd xmm6, eax
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; isolate the last h of this group in the low word
+ movd ebx, xmm2 ; save hi (ebx = h[n - 1] for the first lane of the next group)
+
+ movdqa [edx], xmm5 ; out lo (no add/shift in this variant)
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; post (last 16 pixels of the row)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7 ; right edge: keep words 14-15 in the top dword, so word 14 is reused as src[2n + 2] for the last n
+ psrldq xmm5, 12
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for the first lane comes from the previous group
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa [edx], xmm5 ; out lo (no add/shift in this variant)
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; move left
+ lea esi, [esi - 32 * 2]
+ lea edi, [edi - 16 * 2]
+ lea edx, [edx - 16 * 2]
+
+ ; move down (net effect per row: source advances 64 bytes, each output 32 bytes)
+ lea esi, [esi + 32 * 2]
+ lea edi, [edi + 16 * 2]
+ lea edx, [edx + 16 * 2]
+
+ dec ecx
+ jnz loop1c1
+
+ ret
+
+;******************************************************************************
+; vertical DWT pass, outputs stored without add/shift; source 16 bit signed, 32 pixel width (64 byte rows)
+rfx_dwt_2d_encode_block_verti_16_32: ; in: esi = source, edi = hi output, edx = lo output; uses ecx, xmm1-xmm7
+ mov ecx, 4 ; four strips of 8 columns (16 bytes) each
+loop1d:
+ ; pre (first row pair of the strip)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 32 * 2] ; src[2n + 1]
+ movdqa xmm3, [esi + 32 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm1 ; no previous h on the first row: h[n] itself is added (same as taking h[n - 1] = h[n])
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi (xmm7 carries h[n - 1] into the next step)
+ ; move down
+ lea esi, [esi + 32 * 2 * 2] ; 2 rows
+ lea edi, [edi + 32 * 2] ; 1 row
+ lea edx, [edx + 32 * 2] ; 1 row
+
+ ; loop (outer counter is parked in the upper 16 bits of ecx while cx counts the inner loop)
+ shl ecx, 16
+ mov cx, 14
+loop2d:
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [esi + 32 * 2] ; src[2n + 1]
+ movdqa xmm3, [esi + 32 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea esi, [esi + 32 * 2 * 2] ; 2 rows
+ lea edi, [edi + 32 * 2] ; 1 row
+ lea edx, [edx + 32 * 2] ; 1 row
+
+ dec cx
+ jnz loop2d
+ shr ecx, 16 ; restore the outer counter
+
+ ; post (last row pair of the strip)
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [esi + 32 * 2] ; src[2n + 1]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; bottom edge: no row below is loaded, src[2n] is reused as src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ ; move down
+ lea esi, [esi + 32 * 2 * 2] ; 2 rows
+ lea edi, [edi + 32 * 2] ; 1 row
+ lea edx, [edx + 32 * 2] ; 1 row
+
+ ; move up (back to the top: 32 source rows, 16 output rows)
+ lea esi, [esi - 32 * 32 * 2]
+ lea edi, [edi - 16 * 32 * 2]
+ lea edx, [edx - 16 * 32 * 2]
+
+ ; move right (next strip of 8 columns)
+ lea esi, [esi + 16]
+ lea edi, [edi + 16]
+ lea edx, [edx + 16]
+
+ dec ecx
+ jnz loop1d
+
+ ret
+
+;******************************************************************************
+; horizontal DWT pass with add/shift on both outputs; source 16 bit signed, 64 pixel width, 32 rows
+rfx_dwt_2d_encode_block_horiz_16_64: ; in: esi = source, edi = hi output, edx = lo output; uses eax, ebx, ecx, xmm1-xmm7
+ mov ecx, 32 ; row counter
+loop1e:
+ ; pre (first 16 pixels of the row)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1 ; keep raw words 0-7
+ movdqa xmm7, xmm2 ; keep raw words 8-15
+ pand xmm1, [cdFFFF] ; keep the low (even) word of every dword
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16 ; sign extend the kept word to 32 bits
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = eight even samples src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; shift down one word so the odd samples sit in the low words
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = eight odd samples src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4 ; shift down two words
+ psrldq xmm4, 4
+ movd eax, xmm7 ; words 8-9 fill the vacated top dword of the first half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [esi + 32] ; first dword of the next 16 pixel group supplies src[2n + 2] for the last n
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = eight samples src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD ; add the hi addend, then arithmetic shift right by the hi shift count
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; shift up one word: lane n now holds h[n - 1]
+ and eax, 0xFFFF ; left edge: h[0] is reused as h[-1]
+ movd xmm6, eax
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; isolate the last h of this group in the low word
+ movd ebx, xmm2 ; save hi (ebx = h[n - 1] for the first lane of the next group)
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD ; add the lo addend, then arithmetic shift right by the lo shift count
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; loop over the two middle 16 pixel groups (row counter is parked in the upper 16 bits of ecx while cx counts)
+ shl ecx, 16
+ mov cx, 2
+loop2e:
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [esi + 32] ; first dword of the next 16 pixel group supplies src[2n + 2] for the last n
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for the first lane comes from the previous group
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14
+ movd ebx, xmm2 ; save hi
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ dec cx
+ jnz loop2e
+ shr ecx, 16 ; restore the row counter
+
+ ; post (last 16 pixels of the row)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7 ; right edge: keep words 14-15 in the top dword, so word 14 is reused as src[2n + 2] for the last n
+ psrldq xmm5, 12
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for the first lane comes from the previous group
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; move left
+ lea esi, [esi - 64 * 2]
+ lea edi, [edi - 32 * 2]
+ lea edx, [edx - 32 * 2]
+
+ ; move down (net effect per row: source advances 128 bytes, each output 64 bytes)
+ lea esi, [esi + 64 * 2]
+ lea edi, [edi + 32 * 2]
+ lea edx, [edx + 32 * 2]
+
+ dec ecx
+ jnz loop1e
+
+ ret
+
+;******************************************************************************
+; horizontal DWT pass, add/shift on the hi output only (lo is stored as computed); source 16 bit signed, 64 pixel width, 32 rows
+rfx_dwt_2d_encode_block_horiz_16_64_no_lo: ; in: esi = source, edi = hi output, edx = lo output; uses eax, ebx, ecx, xmm1-xmm7
+ mov ecx, 32 ; row counter
+loop1e1:
+ ; pre (first 16 pixels of the row)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1 ; keep raw words 0-7
+ movdqa xmm7, xmm2 ; keep raw words 8-15
+ pand xmm1, [cdFFFF] ; keep the low (even) word of every dword
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16 ; sign extend the kept word to 32 bits
+ psrad xmm2, 16
+ packssdw xmm1, xmm2 ; xmm1 = eight even samples src[2n]
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; shift down one word so the odd samples sit in the low words
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3 ; xmm2 = eight odd samples src[2n + 1]
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4 ; shift down two words
+ psrldq xmm4, 4
+ movd eax, xmm7 ; words 8-9 fill the vacated top dword of the first half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [esi + 32] ; first dword of the next 16 pixel group supplies src[2n + 2] for the last n
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4 ; xmm3 = eight samples src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD ; add the hi addend, then arithmetic shift right by the hi shift count
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; shift up one word: lane n now holds h[n - 1]
+ and eax, 0xFFFF ; left edge: h[0] is reused as h[-1]
+ movd xmm6, eax
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; isolate the last h of this group in the low word
+ movd ebx, xmm2 ; save hi (ebx = h[n - 1] for the first lane of the next group)
+
+ movdqa [edx], xmm5 ; out lo (no add/shift in this variant)
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; loop over the two middle 16 pixel groups (row counter is parked in the upper 16 bits of ecx while cx counts)
+ shl ecx, 16
+ mov cx, 2
+loop2e1:
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [esi + 32] ; first dword of the next 16 pixel group supplies src[2n + 2] for the last n
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for the first lane comes from the previous group
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14
+ movd ebx, xmm2 ; save hi
+
+ movdqa [edx], xmm5 ; out lo (no add/shift in this variant)
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ dec cx
+ jnz loop2e1
+ shr ecx, 16 ; restore the row counter
+
+ ; post (last 16 pixels of the row)
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ pslld xmm1, 16
+ pslld xmm2, 16
+ psrad xmm1, 16
+ psrad xmm2, 16
+ packssdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ pslld xmm2, 16
+ pslld xmm3, 16
+ psrad xmm2, 16
+ psrad xmm3, 16
+ packssdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7 ; right edge: keep words 14-15 in the top dword, so word 14 is reused as src[2n + 2] for the last n
+ psrldq xmm5, 12
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ pslld xmm3, 16
+ pslld xmm4, 16
+ psrad xmm3, 16
+ psrad xmm4, 16
+ packssdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for the first lane comes from the previous group
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa [edx], xmm5 ; out lo (no add/shift in this variant)
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; move left
+ lea esi, [esi - 64 * 2]
+ lea edi, [edi - 32 * 2]
+ lea edx, [edx - 32 * 2]
+
+ ; move down (net effect per row: source advances 128 bytes, each output 64 bytes)
+ lea esi, [esi + 64 * 2]
+ lea edi, [edi + 32 * 2]
+ lea edx, [edx + 32 * 2]
+
+ dec ecx
+ jnz loop1e1
+
+ ret
+
+;******************************************************************************
+; vertical DWT pass, source 8 bit unsigned, 64 pixel width; in: esi = src, edi = dst hi, edx = dst lo, xmm0 = zero; clobbers ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_verti_8_64:
+ mov ecx, 8 ; 8 strips of 8 columns each
+loop1f:
+ ; pre
+ movq xmm1, [esi] ; src[2n]
+ movq xmm2, [esi + 64 * 1] ; src[2n + 1]
+ movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2]
+ punpcklbw xmm1, xmm0 ; widen 8 bytes to 8 words (xmm0 supplies the high bytes)
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ psubw xmm1, [cw128] ; subtract 128 from every sample
+ psubw xmm2, [cw128]
+ psubw xmm3, [cw128]
+ psllw xmm1, 5 ; scale every sample by 32
+ psllw xmm2, 5
+ psllw xmm3, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm1 ; first row has no h[n - 1]: l[0] = src[0] + h[0]
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea esi, [esi + 64 * 1 * 2] ; 2 rows
+ lea edi, [edi + 64 * 2] ; 1 row
+ lea edx, [edx + 64 * 2] ; 1 row
+
+ ; loop
+ shl ecx, 16 ; park the strip counter in the high half of ecx
+ mov cx, 30 ; 30 middle row pairs
+loop2f:
+ movdqa xmm1, xmm3 ; src[2n]
+ movq xmm2, [esi + 64 * 1] ; src[2n + 1]
+ movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2]
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ psubw xmm2, [cw128]
+ psubw xmm3, [cw128]
+ psllw xmm2, 5
+ psllw xmm3, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7 ; xmm7 = h[n - 1] from the previous row pair
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea esi, [esi + 64 * 1 * 2] ; 2 rows
+ lea edi, [edi + 64 * 2] ; 1 row
+ lea edx, [edx + 64 * 2] ; 1 row
+
+ dec cx
+ jnz loop2f
+ shr ecx, 16 ; restore the strip counter
+
+ ; post
+ movdqa xmm1, xmm3 ; src[2n]
+ movq xmm2, [esi + 64 * 1] ; src[2n + 1]
+ punpcklbw xmm2, xmm0
+ psubw xmm2, [cw128]
+ psllw xmm2, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; last row pair: src[2n] stands in for src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ ; move down
+ lea esi, [esi + 64 * 1 * 2] ; 2 rows
+ lea edi, [edi + 64 * 2] ; 1 row
+ lea edx, [edx + 64 * 2] ; 1 row
+
+ ; move up (undo the 32 row-pair advances made above)
+ lea esi, [esi - 64 * 1 * 64]
+ lea edi, [edi - 32 * 64 * 2]
+ lea edx, [edx - 32 * 64 * 2]
+
+ ; move right (next strip: 8 source bytes, 8 output words)
+ lea esi, [esi + 8]
+ lea edi, [edi + 16]
+ lea edx, [edx + 16]
+
+ dec ecx
+ jnz loop1f
+
+ ret
+
+set_quants_hi: ; in: eax = quant factor; stores LHI_SFT and LHI_ADD; clobbers eax, edx, xmm1
+ sub eax, 6 - 5 ; shift count = factor - 1
+ movd xmm1, eax ; upper bits of xmm1 are zeroed by movd
+ movdqa LHI_SFT, xmm1
+ imul eax, 16 ; byte offset of the 16 byte table entry
+ lea edx, [cwa0]
+ add edx, eax
+ movdqa xmm1, [edx] ; rounding value that goes with this shift
+ movdqa LHI_ADD, xmm1
+ ret
+
+set_quants_lo: ; in: eax = quant factor; stores LLO_SFT and LLO_ADD; clobbers eax, edx, xmm1
+ sub eax, 6 - 5 ; shift count = factor - 1
+ movd xmm1, eax ; upper bits of xmm1 are zeroed by movd
+ movdqa LLO_SFT, xmm1
+ imul eax, 16 ; byte offset of the 16 byte table entry
+ lea edx, [cwa0]
+ add edx, eax
+ movdqa xmm1, [edx] ; rounding value that goes with this shift
+ movdqa LLO_ADD, xmm1
+ ret
+
+%define LQTABLE [esp + 144] ; qtable
+%define LIN_BUFFER [esp + 148] ; in_buffer
+%define LOUT_BUFFER [esp + 152] ; out_buffer
+%define LWORK_BUFFER [esp + 156] ; work_buffer
+; the offsets above are valid once the prologue has run: 16 * 8 scratch + 4 pushed registers = 144
+;int
+;rfxcodec_encode_dwt_shift_x86_sse2(const char *qtable,
+; unsigned char *in_buffer,
+; short *out_buffer,
+; short *work_buffer);
+; three DWT levels (vertical pass to work_buffer, horizontal pass to out_buffer); always returns 0
+;******************************************************************************
+%ifidn __OUTPUT_FORMAT__,elf
+PROC rfxcodec_encode_dwt_shift_x86_sse2
+%else
+PROC _rfxcodec_encode_dwt_shift_x86_sse2
+%endif
+ ; align stack
+ mov eax, esp
+ sub eax, 0x10
+ and eax, 0x0F ; eax = low 4 bits of esp
+ sub esp, eax ; esp is now 16 byte aligned
+ push eax ; remember the adjustment for the epilogue
+ sub esp, 3 * 4 ; pad the push above to a full 16 bytes
+ sub esp, 4 * 4 ; room for a copy of the 4 params
+ ; copy params to after align
+ movdqu xmm0, [esp + eax + 4 * 4 + 3 * 4 + 4 + 4] ; last + 4 skips the return address
+ movdqu [esp], xmm0
+ ; save registers
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 16 * 8 ; scratch area holding the LHI_* / LLO_* quant slots
+ pxor xmm0, xmm0 ; xmm0 = 0, used by the 8 bit vertical pass to widen bytes
+
+ ; vertical DWT to work buffer, level 1
+ mov esi, LIN_BUFFER ; src
+ mov edi, LWORK_BUFFER ; dst hi
+ lea edi, [edi + 64 * 32 * 2] ; dst hi
+ mov edx, LWORK_BUFFER ; dst lo
+ call rfx_dwt_2d_encode_block_verti_8_64
+
+ ; horizontal DWT to out buffer, level 1, part 1
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 4]
+ and al, 0xF ; low nibble of qtable[4]
+ call set_quants_hi
+ mov esi, LWORK_BUFFER ; src
+ mov edi, LOUT_BUFFER ; dst hi - HL1
+ mov edx, LOUT_BUFFER ; dst lo - LL1
+ lea edx, [edx + 32 * 32 * 6] ; dst lo - LL1
+ call rfx_dwt_2d_encode_block_horiz_16_64_no_lo ; lo is stored without the quant shift; it is the level 2 source
+
+ ; horizontal DWT to out buffer, level 1, part 2
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 4]
+ shr al, 4 ; high nibble of qtable[4]
+ call set_quants_hi
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 3]
+ shr al, 4 ; high nibble of qtable[3]
+ call set_quants_lo
+ mov esi, LWORK_BUFFER ; src
+ lea esi, [esi + 64 * 32 * 2] ; src
+ mov edi, LOUT_BUFFER ; dst hi - HH1
+ lea edi, [edi + 32 * 32 * 4] ; dst hi - HH1
+ mov edx, LOUT_BUFFER ; dst lo - LH1
+ lea edx, [edx + 32 * 32 * 2] ; dst lo - LH1
+ call rfx_dwt_2d_encode_block_horiz_16_64
+
+ ; vertical DWT to work buffer, level 2
+ mov esi, LOUT_BUFFER ; src
+ lea esi, [esi + 32 * 32 * 6] ; src
+ mov edi, LWORK_BUFFER ; dst hi
+ lea edi, [edi + 32 * 16 * 2] ; dst hi
+ mov edx, LWORK_BUFFER ; dst lo
+ call rfx_dwt_2d_encode_block_verti_16_32
+
+ ; horizontal DWT to out buffer, level 2, part 1
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 2]
+ shr al, 4 ; high nibble of qtable[2]
+ call set_quants_hi
+ mov esi, LWORK_BUFFER ; src
+ ; 32 * 32 * 6 + 16 * 16 * 0 = 6144
+ mov edi, LOUT_BUFFER ; dst hi - HL2
+ lea edi, [edi + 6144] ; dst hi - HL2
+ ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+ mov edx, LOUT_BUFFER ; dst lo - LL2
+ lea edx, [edx + 7680] ; dst lo - LL2
+ call rfx_dwt_2d_encode_block_horiz_16_32_no_lo ; lo is stored without the quant shift; it is the level 3 source
+
+ ; horizontal DWT to out buffer, level 2, part 2
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 3]
+ and al, 0xF ; low nibble of qtable[3]
+ call set_quants_hi
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 2]
+ and al, 0xF ; low nibble of qtable[2]
+ call set_quants_lo
+ mov esi, LWORK_BUFFER ; src
+ lea esi, [esi + 32 * 16 * 2] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 4 = 7168
+ mov edi, LOUT_BUFFER ; dst hi - HH2
+ lea edi, [edi + 7168] ; dst hi - HH2
+ ; 32 * 32 * 6 + 16 * 16 * 2 = 6656
+ mov edx, LOUT_BUFFER ; dst lo - LH2
+ lea edx, [edx + 6656] ; dst lo - LH2
+ call rfx_dwt_2d_encode_block_horiz_16_32
+
+ ; vertical DWT to work buffer, level 3
+ ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+ mov esi, LOUT_BUFFER ; src
+ lea esi, [esi + 7680] ; src
+ mov edi, LWORK_BUFFER ; dst hi
+ lea edi, [edi + 16 * 8 * 2] ; dst hi
+ mov edx, LWORK_BUFFER ; dst lo
+ call rfx_dwt_2d_encode_block_verti_16_16
+
+ ; horizontal DWT to out buffer, level 3, part 1
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 1]
+ and al, 0xF ; low nibble of qtable[1]
+ call set_quants_hi
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 0]
+ and al, 0xF ; low nibble of qtable[0]
+ call set_quants_lo
+ mov esi, LWORK_BUFFER ; src
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680
+ mov edi, LOUT_BUFFER ; dst hi - HL3
+ lea edi, [edi + 7680] ; dst hi - HL3
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064
+ mov edx, LOUT_BUFFER ; dst lo - LL3
+ lea edx, [edx + 8064] ; dst lo - LL3
+ call rfx_dwt_2d_encode_block_horiz_16_16
+
+ ; horizontal DWT to out buffer, level 3, part 2
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 1]
+ shr al, 4 ; high nibble of qtable[1]
+ call set_quants_hi
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 0]
+ shr al, 4 ; high nibble of qtable[0]
+ call set_quants_lo
+ mov esi, LWORK_BUFFER ; src
+ lea esi, [esi + 16 * 8 * 2] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936
+ mov edi, LOUT_BUFFER ; dst hi - HH3
+ lea edi, [edi + 7936] ; dst hi - HH3
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808
+ mov edx, LOUT_BUFFER ; dst lo - LH3
+ lea edx, [edx + 7808] ; dst lo - LH3
+ call rfx_dwt_2d_encode_block_horiz_16_16
+
+ ; quants
+ add esp, 16 * 8
+ ; restore registers
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ; params
+ add esp, 3 * 4
+ add esp, 4 * 4
+ ; align
+ pop eax ; alignment adjustment saved by the prologue
+ add esp, eax
+ ; return value
+ mov eax, 0
+ ret
+ align 16
+
diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
new file mode 100644
index 0000000..cb117da
--- /dev/null
+++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
@@ -0,0 +1,1401 @@
+;
+;Copyright 2016 Jay Sorg
+;
+;Permission to use, copy, modify, distribute, and sell this software and its
+;documentation for any purpose is hereby granted without fee, provided that
+;the above copyright notice appear in all copies and that both that
+;copyright notice and this permission notice appear in supporting
+;documentation.
+;
+;The above copyright notice and this permission notice shall be included in
+;all copies or substantial portions of the Software.
+;
+;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+;
+;x86 asm dwt
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+section .data
+ align 16
+ cw128 times 8 dw 128 ; 8 words of 128, subtracted from the widened 8 bit samples
+ cdFFFF times 4 dd 65535 ; dword mask that keeps the low word of each dword
+ ; rounding table, 16 bytes per entry, indexed from cwa0: entry k is 8 words of 1 << (k - 1), entry 0 is 0
+ cwa0 times 8 dw 0 ; 0
+ cwa1 times 8 dw 1 ; 1
+ cwa2 times 8 dw 2 ; 2
+ cwa4 times 8 dw 4 ; 3
+ cwa8 times 8 dw 8 ; 4
+ cwa16 times 8 dw 16 ; 5
+ cwa32 times 8 dw 32 ; 6
+ cwa64 times 8 dw 64 ; 7
+ cwa128 times 8 dw 128 ; 8
+ cwa256 times 8 dw 256 ; 9
+ cwa512 times 8 dw 512 ; 10
+ cwa1024 times 8 dw 1024 ; 11
+ cwa2048 times 8 dw 2048 ; 12
+ cwa4096 times 8 dw 4096 ; 13
+ cwa8192 times 8 dw 8192 ; 14
+ cwa16384 times 8 dw 16384 ; 15
+
+section .text
+
+%macro PROC 1 ; emits a 16 byte aligned, global label named by the argument
+ align 16
+ global %1
+ %1:
+%endmacro
+; quant slots on the stack; NOTE(review): the + 4 presumably skips the return address pushed by call - confirm against the entry point
+%define LHI_ADD [esp + 1 * 16 + 4] ; rounding value added to hi output
+%define LHI_SFT [esp + 2 * 16 + 4] ; right shift count for hi output
+%define LLO_ADD [esp + 3 * 16 + 4] ; rounding value added to lo output
+%define LLO_SFT [esp + 4 * 16 + 4] ; right shift count for lo output
+
+;******************************************************************************
+; horizontal DWT pass, source 16 bit signed, 16 pixel width; in: esi = src, edi = dst hi, edx = dst lo; clobbers eax, ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_horiz_16_16:
+ mov ecx, 8 ; 8 rows
+loop1a:
+ ; pre / post
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1 ; keep unmasked copies of both halves of the row
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF] ; keep the low (even) word of each dword
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2 ; 8 even samples packed as words
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; odd words move into the low word of each dword
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7 ; first dword of the second half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; ... goes into the top dword emptied by the shift
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; isolate the last dword of the row
+ por xmm4, xmm5 ; row end: last lane gets src[2n] in place of src[2n + 2]
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD ; add the rounding value, then shift by the hi quant
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; lane n now holds h[n - 1]
+ and eax, 0xFFFF ; h[0]
+ movd xmm6, eax
+ por xmm7, xmm6 ; row start: lane 0 gets h[0] in place of h[-1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD ; add the rounding value, then shift by the lo quant
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; move left (back to the start of the row)
+ lea esi, [esi - 16 * 2]
+ lea edi, [edi - 8 * 2]
+ lea edx, [edx - 8 * 2]
+
+ ; move down (next row)
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ dec ecx
+ jnz loop1a
+
+ ret
+
+;******************************************************************************
+; vertical DWT pass, source 16 bit signed, 16 pixel width; in: esi = src, edi = dst hi, edx = dst lo; clobbers ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_verti_16_16:
+ mov ecx, 2 ; 2 strips of 8 columns each
+loop1b:
+ ; pre
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16 * 2] ; src[2n + 1]
+ movdqa xmm3, [esi + 16 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm1 ; first row has no h[n - 1]: l[0] = src[0] + h[0]
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea esi, [esi + 16 * 2 * 2] ; 2 rows
+ lea edi, [edi + 16 * 2] ; 1 row
+ lea edx, [edx + 16 * 2] ; 1 row
+
+ ; loop
+ shl ecx, 16 ; park the strip counter in the high half of ecx
+ mov cx, 6 ; 6 middle row pairs
+loop2b:
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [esi + 16 * 2] ; src[2n + 1]
+ movdqa xmm3, [esi + 16 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7 ; xmm7 = h[n - 1] from the previous row pair
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea esi, [esi + 16 * 2 * 2] ; 2 rows
+ lea edi, [edi + 16 * 2] ; 1 row
+ lea edx, [edx + 16 * 2] ; 1 row
+
+ dec cx
+ jnz loop2b
+ shr ecx, 16 ; restore the strip counter
+
+ ; post
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [esi + 16 * 2] ; src[2n + 1]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; last row pair: src[2n] stands in for src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ ; move down
+ lea esi, [esi + 16 * 2 * 2] ; 2 row
+ lea edi, [edi + 16 * 2] ; 1 row
+ lea edx, [edx + 16 * 2] ; 1 row
+
+ ; move up (undo the 8 row-pair advances made above)
+ lea esi, [esi - 16 * 16 * 2]
+ lea edi, [edi - 8 * 16 * 2]
+ lea edx, [edx - 8 * 16 * 2]
+
+ ; move right (next strip of 8 words)
+ lea esi, [esi + 16]
+ lea edi, [edi + 16]
+ lea edx, [edx + 16]
+
+ dec ecx
+ jnz loop1b
+
+ ret
+
+;******************************************************************************
+; horizontal DWT pass, source 16 bit signed, 32 pixel width; in: esi = src, edi = dst hi, edx = dst lo; clobbers eax, ebx, ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_horiz_16_32:
+ mov ecx, 16 ; 16 rows
+loop1c:
+ ; pre
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1 ; keep unmasked copies of both halves of the chunk
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF] ; keep the low (even) word of each dword
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2 ; 8 even samples packed as words
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; odd words move into the low word of each dword
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7 ; first dword of the second half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; ... goes into the top dword emptied by the shift
+ mov eax, [esi + 32] ; first dword of the next chunk supplies the last src[2n + 2]
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD ; add the rounding value, then shift by the hi quant
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; lane n now holds h[n - 1]
+ and eax, 0xFFFF ; h[0]
+ movd xmm6, eax
+ por xmm7, xmm6 ; row start: lane 0 gets h[0] in place of h[-1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; last h of this chunk, alone in the low word
+ movd ebx, xmm2 ; save hi
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD ; add the rounding value, then shift by the lo quant
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; post
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; isolate the last dword of the row
+ por xmm4, xmm5 ; row end: last lane gets src[2n] in place of src[2n + 2]
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for lane 0 is the last h of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; move left (back to the start of the row)
+ lea esi, [esi - 32 * 2]
+ lea edi, [edi - 16 * 2]
+ lea edx, [edx - 16 * 2]
+
+ ; move down (next row)
+ lea esi, [esi + 32 * 2]
+ lea edi, [edi + 16 * 2]
+ lea edx, [edx + 16 * 2]
+
+ dec ecx
+ jnz loop1c
+
+ ret
+
+;******************************************************************************
+; horizontal DWT pass, source 16 bit signed, 32 pixel width, lo stored without LLO_ADD / LLO_SFT; in: esi = src, edi = dst hi, edx = dst lo; clobbers eax, ebx, ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_horiz_16_32_no_lo:
+ mov ecx, 16 ; 16 rows
+loop1c1:
+ ; pre
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1 ; keep unmasked copies of both halves of the chunk
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF] ; keep the low (even) word of each dword
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2 ; 8 even samples packed as words
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; odd words move into the low word of each dword
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7 ; first dword of the second half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; ... goes into the top dword emptied by the shift
+ mov eax, [esi + 32] ; first dword of the next chunk supplies the last src[2n + 2]
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD ; add the rounding value, then shift by the hi quant
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; lane n now holds h[n - 1]
+ and eax, 0xFFFF ; h[0]
+ movd xmm6, eax
+ por xmm7, xmm6 ; row start: lane 0 gets h[0] in place of h[-1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; last h of this chunk, alone in the low word
+ movd ebx, xmm2 ; save hi
+
+ movdqa [edx], xmm5 ; out lo
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; post
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; isolate the last dword of the row
+ por xmm4, xmm5 ; row end: last lane gets src[2n] in place of src[2n + 2]
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for lane 0 is the last h of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa [edx], xmm5 ; out lo
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; move left (back to the start of the row)
+ lea esi, [esi - 32 * 2]
+ lea edi, [edi - 16 * 2]
+ lea edx, [edx - 16 * 2]
+
+ ; move down (next row)
+ lea esi, [esi + 32 * 2]
+ lea edi, [edi + 16 * 2]
+ lea edx, [edx + 16 * 2]
+
+ dec ecx
+ jnz loop1c1
+
+ ret
+
+;******************************************************************************
+; vertical DWT pass, source 16 bit signed, 32 pixel width; in: esi = src, edi = dst hi, edx = dst lo; clobbers ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_verti_16_32:
+ mov ecx, 4 ; 4 strips of 8 columns each
+loop1d:
+ ; pre
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 32 * 2] ; src[2n + 1]
+ movdqa xmm3, [esi + 32 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm1 ; first row has no h[n - 1]: l[0] = src[0] + h[0]
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea esi, [esi + 32 * 2 * 2] ; 2 rows
+ lea edi, [edi + 32 * 2] ; 1 row
+ lea edx, [edx + 32 * 2] ; 1 row
+
+ ; loop
+ shl ecx, 16 ; park the strip counter in the high half of ecx
+ mov cx, 14 ; 14 middle row pairs
+loop2d:
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [esi + 32 * 2] ; src[2n + 1]
+ movdqa xmm3, [esi + 32 * 2 * 2] ; src[2n + 2]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7 ; xmm7 = h[n - 1] from the previous row pair
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea esi, [esi + 32 * 2 * 2] ; 2 rows
+ lea edi, [edi + 32 * 2] ; 1 row
+ lea edx, [edx + 32 * 2] ; 1 row
+
+ dec cx
+ jnz loop2d
+ shr ecx, 16 ; restore the strip counter
+
+ ; post
+ movdqa xmm1, xmm3 ; src[2n]
+ movdqa xmm2, [esi + 32 * 2] ; src[2n + 1]
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; last row pair: src[2n] stands in for src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ ; move down
+ lea esi, [esi + 32 * 2 * 2] ; 2 row
+ lea edi, [edi + 32 * 2] ; 1 row
+ lea edx, [edx + 32 * 2] ; 1 row
+
+ ; move up (undo the 16 row-pair advances made above)
+ lea esi, [esi - 32 * 32 * 2]
+ lea edi, [edi - 16 * 32 * 2]
+ lea edx, [edx - 16 * 32 * 2]
+
+ ; move right (next strip of 8 words)
+ lea esi, [esi + 16]
+ lea edi, [edi + 16]
+ lea edx, [edx + 16]
+
+ dec ecx
+ jnz loop1d
+
+ ret
+
+;******************************************************************************
+; horizontal DWT pass, source 16 bit signed, 64 pixel width; in: esi = src, edi = dst hi, edx = dst lo; clobbers eax, ebx, ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_horiz_16_64:
+ mov ecx, 32 ; 32 rows
+loop1e:
+ ; pre
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1 ; keep unmasked copies of both halves of the chunk
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF] ; keep the low (even) word of each dword
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2 ; 8 even samples packed as words
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; odd words move into the low word of each dword
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7 ; first dword of the second half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; ... goes into the top dword emptied by the shift
+ mov eax, [esi + 32] ; first dword of the next chunk supplies the last src[2n + 2]
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD ; add the rounding value, then shift by the hi quant
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; lane n now holds h[n - 1]
+ and eax, 0xFFFF ; h[0]
+ movd xmm6, eax
+ por xmm7, xmm6 ; row start: lane 0 gets h[0] in place of h[-1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; last h of this chunk, alone in the low word
+ movd ebx, xmm2 ; save hi
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD ; add the rounding value, then shift by the lo quant
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; loop
+ shl ecx, 16 ; park the row counter in the high half of ecx
+ mov cx, 2 ; 2 middle chunks
+loop2e:
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [esi + 32] ; first dword of the next chunk
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for lane 0 is the last h of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14
+ movd ebx, xmm2 ; save hi
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ dec cx
+ jnz loop2e
+ shr ecx, 16 ; restore the row counter
+
+ ; post
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; isolate the last dword of the row
+ por xmm4, xmm5 ; row end: last lane gets src[2n] in place of src[2n + 2]
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for lane 0 is the last h of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa xmm6, xmm5 ; out lo
+ paddw xmm6, LLO_ADD
+ psraw xmm6, LLO_SFT
+ movdqa [edx], xmm6
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; move left (back to the start of the row)
+ lea esi, [esi - 64 * 2]
+ lea edi, [edi - 32 * 2]
+ lea edx, [edx - 32 * 2]
+
+ ; move down (next row)
+ lea esi, [esi + 64 * 2]
+ lea edi, [edi + 32 * 2]
+ lea edx, [edx + 32 * 2]
+
+ dec ecx
+ jnz loop1e
+
+ ret
+
+;******************************************************************************
+; horizontal DWT pass, source 16 bit signed, 64 pixel width, lo stored without LLO_ADD / LLO_SFT; in: esi = src, edi = dst hi, edx = dst lo; clobbers eax, ebx, ecx, xmm1-xmm7
+rfx_dwt_2d_encode_block_horiz_16_64_no_lo:
+ mov ecx, 32 ; 32 rows
+loop1e1:
+ ; pre
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1 ; keep unmasked copies of both halves of the chunk
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF] ; keep the low (even) word of each dword
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2 ; 8 even samples packed as words
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2 ; odd words move into the low word of each dword
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7 ; first dword of the second half
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5 ; ... goes into the top dword emptied by the shift
+ mov eax, [esi + 32] ; first dword of the next chunk supplies the last src[2n + 2]
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD ; add the rounding value, then shift by the hi quant
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ movd eax, xmm7
+ pslldq xmm7, 2 ; lane n now holds h[n - 1]
+ and eax, 0xFFFF ; h[0]
+ movd xmm6, eax
+ por xmm7, xmm6 ; row start: lane 0 gets h[0] in place of h[-1]
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14 ; last h of this chunk, alone in the low word
+ movd ebx, xmm2 ; save hi
+
+ movdqa [edx], xmm5 ; out lo
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; loop
+ shl ecx, 16 ; park the row counter in the high half of ecx
+ mov cx, 2 ; 2 middle chunks
+loop2e1:
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ mov eax, [esi + 32] ; first dword of the next chunk
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm4, xmm5
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+ movdqa xmm2, xmm5 ; save hi
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for lane 0 is the last h of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ psrldq xmm2, 14
+ movd ebx, xmm2 ; save hi
+
+ movdqa [edx], xmm5 ; out lo
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ dec cx
+ jnz loop2e1
+ shr ecx, 16 ; restore the row counter
+
+ ; post
+ movdqa xmm1, [esi] ; src[2n]
+ movdqa xmm2, [esi + 16]
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm2
+ pand xmm1, [cdFFFF]
+ pand xmm2, [cdFFFF]
+ packusdw xmm1, xmm2
+ movdqa xmm2, xmm6 ; src[2n + 1]
+ movdqa xmm3, xmm7
+ psrldq xmm2, 2
+ psrldq xmm3, 2
+ pand xmm2, [cdFFFF]
+ pand xmm3, [cdFFFF]
+ packusdw xmm2, xmm3
+ movdqa xmm3, xmm6 ; src[2n + 2]
+ movdqa xmm4, xmm7
+ psrldq xmm3, 4
+ psrldq xmm4, 4
+ movd eax, xmm7
+ movd xmm5, eax
+ pslldq xmm5, 12
+ por xmm3, xmm5
+ movdqa xmm5, xmm7
+ psrldq xmm5, 12
+ pslldq xmm5, 12 ; isolate the last dword of the row
+ por xmm4, xmm5 ; row end: last lane gets src[2n] in place of src[2n + 2]
+ pand xmm3, [cdFFFF]
+ pand xmm4, [cdFFFF]
+ packusdw xmm3, xmm4
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+
+ movdqa xmm6, xmm5 ; out hi
+ paddw xmm6, LHI_ADD
+ psraw xmm6, LHI_SFT
+ movdqa [edi], xmm6
+
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ movdqa xmm7, xmm5
+ pslldq xmm7, 2
+ movd xmm6, ebx ; h[n - 1] for lane 0 is the last h of the previous chunk
+ por xmm7, xmm6
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+
+ movdqa [edx], xmm5 ; out lo
+
+ ; move right
+ lea esi, [esi + 16 * 2]
+ lea edi, [edi + 8 * 2]
+ lea edx, [edx + 8 * 2]
+
+ ; move left (back to the start of the row)
+ lea esi, [esi - 64 * 2]
+ lea edi, [edi - 32 * 2]
+ lea edx, [edx - 32 * 2]
+
+ ; move down (next row)
+ lea esi, [esi + 64 * 2]
+ lea edi, [edi + 32 * 2]
+ lea edx, [edx + 32 * 2]
+
+ dec ecx
+ jnz loop1e1
+
+ ret
+
+;******************************************************************************
+; vertical DWT pass, source 8 bit unsigned, 64 pixel width; in: esi = src, edi = dst hi, edx = dst lo; clobbers ecx, xmm1-xmm7; NOTE(review): xmm0 is presumably zeroed by the caller - confirm
+rfx_dwt_2d_encode_block_verti_8_64:
+ mov ecx, 8 ; 8 strips of 8 columns each
+loop1f:
+ ; pre
+ movq xmm1, [esi] ; src[2n]
+ movq xmm2, [esi + 64 * 1] ; src[2n + 1]
+ movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2]
+ punpcklbw xmm1, xmm0 ; widen 8 bytes to 8 words (xmm0 supplies the high bytes)
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ psubw xmm1, [cw128] ; subtract 128 from every sample
+ psubw xmm2, [cw128]
+ psubw xmm3, [cw128]
+ psllw xmm1, 5 ; scale every sample by 32
+ psllw xmm2, 5
+ psllw xmm3, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm1 ; first row has no h[n - 1]: l[0] = src[0] + h[0]
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea esi, [esi + 64 * 1 * 2] ; 2 rows
+ lea edi, [edi + 64 * 2] ; 1 row
+ lea edx, [edx + 64 * 2] ; 1 row
+
+ ; loop
+ shl ecx, 16 ; park the strip counter in the high half of ecx
+ mov cx, 30 ; 30 middle row pairs
+loop2f:
+ movdqa xmm1, xmm3 ; src[2n]
+ movq xmm2, [esi + 64 * 1] ; src[2n + 1]
+ movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2]
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+ psubw xmm2, [cw128]
+ psubw xmm3, [cw128]
+ psllw xmm2, 5
+ psllw xmm3, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ movdqa xmm6, xmm5 ; save hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7 ; xmm7 = h[n - 1] from the previous row pair
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ movdqa xmm7, xmm6 ; save hi
+ ; move down
+ lea esi, [esi + 64 * 1 * 2] ; 2 rows
+ lea edi, [edi + 64 * 2] ; 1 row
+ lea edx, [edx + 64 * 2] ; 1 row
+
+ dec cx
+ jnz loop2f
+ shr ecx, 16 ; restore the strip counter
+
+ ; post
+ movdqa xmm1, xmm3 ; src[2n]
+ movq xmm2, [esi + 64 * 1] ; src[2n + 1]
+ punpcklbw xmm2, xmm0
+ psubw xmm2, [cw128]
+ psllw xmm2, 5
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm3 ; last row pair: src[2n] stands in for src[2n + 2]
+ ; h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1
+ paddw xmm4, xmm6
+ psraw xmm4, 1
+ psubw xmm5, xmm4
+ psraw xmm5, 1
+ movdqa [edi], xmm5 ; out hi
+ ; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
+ paddw xmm5, xmm7
+ psraw xmm5, 1
+ paddw xmm5, xmm1
+ movdqa [edx], xmm5 ; out lo
+ ; move down
+ lea esi, [esi + 64 * 1 * 2] ; 2 rows
+ lea edi, [edi + 64 * 2] ; 1 row
+ lea edx, [edx + 64 * 2] ; 1 row
+
+ ; move up (undo the 32 row-pair advances made above)
+ lea esi, [esi - 64 * 1 * 64]
+ lea edi, [edi - 32 * 64 * 2]
+ lea edx, [edx - 32 * 64 * 2]
+
+ ; move right (next strip: 8 source bytes, 8 output words)
+ lea esi, [esi + 8]
+ lea edi, [edi + 16]
+ lea edx, [edx + 16]
+
+ dec ecx
+ jnz loop1f
+
+ ret
+
+; set_quants_hi
+; in: eax = quant value (the callers visible in this file pass a 4 bit nibble
+;     read from the qtable)
+; Stores eax - 1 in LHI_SFT and copies the 16 byte entry of the cwa0 table
+; selected by that value to LHI_ADD. LHI_SFT, LHI_ADD and cwa0 are defined
+; outside this view.
+; clobbers eax, edx, xmm1
+set_quants_hi:
+ sub eax, 6 - 5 ; quant - 1; NOTE(review): the 5 presumably matches the 5 bit pre-scale of the samples - confirm
+ movd xmm1, eax
+ movdqa LHI_SFT, xmm1
+ imul eax, 16 ; byte offset into cwa0, 16 bytes per entry
+ lea edx, [cwa0]
+ add edx, eax
+ movdqa xmm1, [edx]
+ movdqa LHI_ADD, xmm1
+ ret
+
+; set_quants_lo
+; in: eax = quant value (the callers visible in this file pass a 4 bit nibble
+;     read from the qtable)
+; Stores eax - 1 in LLO_SFT and copies the 16 byte entry of the cwa0 table
+; selected by that value to LLO_ADD. LLO_SFT, LLO_ADD and cwa0 are defined
+; outside this view.
+; clobbers eax, edx, xmm1
+set_quants_lo:
+ sub eax, 6 - 5 ; quant - 1; NOTE(review): the 5 presumably matches the 5 bit pre-scale of the samples - confirm
+ movd xmm1, eax
+ movdqa LLO_SFT, xmm1
+ imul eax, 16 ; byte offset into cwa0, 16 bytes per entry
+ lea edx, [cwa0]
+ add edx, eax
+ movdqa xmm1, [edx]
+ movdqa LLO_ADD, xmm1
+ ret
+
+; Parameter copies as seen from the body of the PROC below, after its
+; prologue: 16 * 8 bytes of scratch plus 4 saved registers (16 bytes) sit
+; between esp and the copies, hence the 144 byte base offset.
+%define LQTABLE [esp + 144] ; qtable
+%define LIN_BUFFER [esp + 148] ; in_buffer
+%define LOUT_BUFFER [esp + 152] ; out_buffer
+%define LWORK_BUFFER [esp + 156] ; work_buffer
+
+;int
+;rfxcodec_encode_dwt_shift_x86_sse41(const char *qtable,
+; unsigned char *in_buffer,
+; short *out_buffer,
+; short *work_buffer);
+;
+; Runs three DWT levels. Each level does a vertical pass into work_buffer
+; followed by two horizontal passes into out_buffer, with the quant values
+; for each pass taken from nibbles of qtable bytes 0 - 4.
+; Always returns 0 in eax.
+
+;******************************************************************************
+; non-elf output formats get a leading underscore on the exported symbol
+%ifidn __OUTPUT_FORMAT__,elf
+PROC rfxcodec_encode_dwt_shift_x86_sse41
+%else
+PROC _rfxcodec_encode_dwt_shift_x86_sse41
+%endif
+ ; align stack
+ mov eax, esp
+ sub eax, 0x10
+ and eax, 0x0F ; eax = low 4 bits of esp (the sub above leaves them unchanged)
+ sub esp, eax ; esp is now a multiple of 16
+ push eax ; remember the adjustment for the epilogue
+ sub esp, 3 * 4 ; pad: with the pushed eax this is 16 bytes
+ sub esp, 4 * 4 ; room for the 4 parameter copies
+ ; copy params to after align
+ ; source is the entry esp + 4, i.e. the 16 bytes just above the return address
+ movdqu xmm0, [esp + eax + 4 * 4 + 3 * 4 + 4 + 4]
+ movdqu [esp], xmm0
+ ; save registers
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 16 * 8 ; scratch, released under "quants" in the epilogue
+ pxor xmm0, xmm0 ; zero, unpacked against the source bytes by the vertical 8 bit pass
+
+ ; vertical DWT to work buffer, level 1
+ mov esi, LIN_BUFFER ; src
+ mov edi, LWORK_BUFFER ; dst hi
+ lea edi, [edi + 64 * 32 * 2] ; dst hi
+ mov edx, LWORK_BUFFER ; dst lo
+ call rfx_dwt_2d_encode_block_verti_8_64
+
+ ; horizontal DWT to out buffer, level 1, part 1
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 4]
+ and al, 0xF ; low nibble of qtable[4]
+ call set_quants_hi
+ mov esi, LWORK_BUFFER ; src
+ mov edi, LOUT_BUFFER ; dst hi - HL1
+ mov edx, LOUT_BUFFER ; dst lo - LL1
+ lea edx, [edx + 32 * 32 * 6] ; dst lo - LL1
+ call rfx_dwt_2d_encode_block_horiz_16_64_no_lo
+
+ ; horizontal DWT to out buffer, level 1, part 2
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 4]
+ shr al, 4 ; high nibble of qtable[4]
+ call set_quants_hi
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 3]
+ shr al, 4 ; high nibble of qtable[3]
+ call set_quants_lo
+ mov esi, LWORK_BUFFER ; src
+ lea esi, [esi + 64 * 32 * 2] ; src
+ mov edi, LOUT_BUFFER ; dst hi - HH1
+ lea edi, [edi + 32 * 32 * 4] ; dst hi - HH1
+ mov edx, LOUT_BUFFER ; dst lo - LH1
+ lea edx, [edx + 32 * 32 * 2] ; dst lo - LH1
+ call rfx_dwt_2d_encode_block_horiz_16_64
+
+ ; vertical DWT to work buffer, level 2
+ mov esi, LOUT_BUFFER ; src
+ lea esi, [esi + 32 * 32 * 6] ; src
+ mov edi, LWORK_BUFFER ; dst hi
+ lea edi, [edi + 32 * 16 * 2] ; dst hi
+ mov edx, LWORK_BUFFER ; dst lo
+ call rfx_dwt_2d_encode_block_verti_16_32
+
+ ; horizontal DWT to out buffer, level 2, part 1
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 2]
+ shr al, 4 ; high nibble of qtable[2]
+ call set_quants_hi
+ mov esi, LWORK_BUFFER ; src
+ ; 32 * 32 * 6 + 16 * 16 * 0 = 6144
+ mov edi, LOUT_BUFFER ; dst hi - HL2
+ lea edi, [edi + 6144] ; dst hi - HL2
+ ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+ mov edx, LOUT_BUFFER ; dst lo - LL2
+ lea edx, [edx + 7680] ; dst lo - LL2
+ call rfx_dwt_2d_encode_block_horiz_16_32_no_lo
+
+ ; horizontal DWT to out buffer, level 2, part 2
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 3]
+ and al, 0xF ; low nibble of qtable[3]
+ call set_quants_hi
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 2]
+ and al, 0xF ; low nibble of qtable[2]
+ call set_quants_lo
+ mov esi, LWORK_BUFFER ; src
+ lea esi, [esi + 32 * 16 * 2] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 4 = 7168
+ mov edi, LOUT_BUFFER ; dst hi - HH2
+ lea edi, [edi + 7168] ; dst hi - HH2
+ ; 32 * 32 * 6 + 16 * 16 * 2 = 6656
+ mov edx, LOUT_BUFFER ; dst lo - LH2
+ lea edx, [edx + 6656] ; dst lo - LH2
+ call rfx_dwt_2d_encode_block_horiz_16_32
+
+ ; vertical DWT to work buffer, level 3
+ ; 32 * 32 * 6 + 16 * 16 * 6 = 7680
+ mov esi, LOUT_BUFFER ; src
+ lea esi, [esi + 7680] ; src
+ mov edi, LWORK_BUFFER ; dst hi
+ lea edi, [edi + 16 * 8 * 2] ; dst hi
+ mov edx, LWORK_BUFFER ; dst lo
+ call rfx_dwt_2d_encode_block_verti_16_16
+
+ ; horizontal DWT to out buffer, level 3, part 1
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 1]
+ and al, 0xF ; low nibble of qtable[1]
+ call set_quants_hi
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 0]
+ and al, 0xF ; low nibble of qtable[0]
+ call set_quants_lo
+ mov esi, LWORK_BUFFER ; src
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 0 = 7680
+ mov edi, LOUT_BUFFER ; dst hi - HL3
+ lea edi, [edi + 7680] ; dst hi - HL3
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 6 = 8064
+ mov edx, LOUT_BUFFER ; dst lo - LL3
+ lea edx, [edx + 8064] ; dst lo - LL3
+ call rfx_dwt_2d_encode_block_horiz_16_16
+
+ ; horizontal DWT to out buffer, level 3, part 2
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 1]
+ shr al, 4 ; high nibble of qtable[1]
+ call set_quants_hi
+ xor eax, eax
+ mov edx, LQTABLE
+ mov al, [edx + 0]
+ shr al, 4 ; high nibble of qtable[0]
+ call set_quants_lo
+ mov esi, LWORK_BUFFER ; src
+ lea esi, [esi + 16 * 8 * 2] ; src
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 4 = 7936
+ mov edi, LOUT_BUFFER ; dst hi - HH3
+ lea edi, [edi + 7936] ; dst hi - HH3
+ ; 32 * 32 * 6 + 16 * 16 * 6 + 8 * 8 * 2 = 7808
+ mov edx, LOUT_BUFFER ; dst lo - LH3
+ lea edx, [edx + 7808] ; dst lo - LH3
+ call rfx_dwt_2d_encode_block_horiz_16_16
+
+ ; quants
+ add esp, 16 * 8
+ ; restore registers
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ; params
+ add esp, 3 * 4
+ add esp, 4 * 4
+ ; align
+ pop eax ; the adjustment pushed in the prologue
+ add esp, eax ; back to the entry esp
+ ; return value
+ mov eax, 0
+ ret
+ align 16
+
diff --git a/src/x86/rfxdwt_x86_sse2.asm b/src/x86/rfxdwt_x86_sse2.asm
deleted file mode 100644
index dd2a2d9..0000000
--- a/src/x86/rfxdwt_x86_sse2.asm
+++ /dev/null
@@ -1,25 +0,0 @@
-
-section .data
- const1 times 8 dw 1
-
-%macro PROC 1
- align 16
- global %1
- %1:
-%endmacro
-
-;int
-;dwt_shift_x86_sse2(const int* qtable, sint8* src, sint16* dst, sint16* temp)
-
-PROC dwt_shift_x86_sse2
- push ebx
- push esi
- push edi
-
- mov eax, 0
- pop edi
- pop esi
- pop ebx
- ret
- align 16
-
diff --git a/src/x86/rfxrlgr1_x86.asm b/src/x86/rfxrlgr1_x86.asm
deleted file mode 100644
index 8441051..0000000
--- a/src/x86/rfxrlgr1_x86.asm
+++ /dev/null
@@ -1,25 +0,0 @@
-
-section .data
- const1 times 8 dw 1
-
-%macro PROC 1
- align 16
- global %1
- %1:
-%endmacro
-
-;int
-;diff_rlgr1_x86(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
-
-PROC diff_rlgr1_x86
- push ebx
- push esi
- push edi
-
- mov eax, 0
- pop edi
- pop esi
- pop ebx
- ret
- align 16
-
diff --git a/src/x86/rfxrlgr3_x86.asm b/src/x86/rfxrlgr3_x86.asm
deleted file mode 100644
index 08b278d..0000000
--- a/src/x86/rfxrlgr3_x86.asm
+++ /dev/null
@@ -1,25 +0,0 @@
-
-section .data
- const1 times 8 dw 1
-
-%macro PROC 1
- align 16
- global %1
- %1:
-%endmacro
-
-;int
-;diff_rlgr3_x86(sint16 *co, int num_co, uint8 *dst, int dst_bytes);
-
-PROC diff_rlgr3_x86
- push ebx
- push esi
- push edi
-
- mov eax, 0
- pop edi
- pop esi
- pop ebx
- ret
- align 16
-
diff --git a/tests/Makefile b/tests/Makefile
deleted file mode 100644
index 36cd57d..0000000
--- a/tests/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-
-OBJS = rfxcodectest.o
-
-CFLAGS = -g -O2 -Wall -fPIC -I../include
-
-# this for linking to .so
-#LDFLAGS = $(PROFIL) -L../src -Wl,-rpath=../src
-# this if using .a
-LDFLAGS = $(PROFIL)
-
-# this for linking to .so
-#LIBS = -lrfxencode
-# this for using .a
-LIBS = ../src/librfxencode.a
-
-all: rfxcodectest
-
-rfxcodectest: $(OBJS) Makefile
- $(CC) -o rfxcodectest $(LDFLAGS) $(OBJS) $(LIBS)
-
-clean:
- rm -f $(OBJS) rfxcodectest
diff --git a/tests/Makefile.am b/tests/Makefile.am
new file mode 100644
index 0000000..8e24edc
--- /dev/null
+++ b/tests/Makefile.am
@@ -0,0 +1,11 @@
+EXTRA_DIST = readme.txt
+
+AM_CPPFLAGS = \
+ -I$(top_srcdir)/include
+
+bin_PROGRAMS = rfxcodectest
+
+rfxcodectest_SOURCES = rfxcodectest.c
+
+rfxcodectest_LDADD = \
+ $(top_builddir)/src/librfxencode.la
diff --git a/tests/rfxcodectest.c b/tests/rfxcodectest.c
index f959185..6733db8 100644
--- a/tests/rfxcodectest.c
+++ b/tests/rfxcodectest.c
@@ -1,7 +1,7 @@
/**
* RFX codec encoder test
*
- * Copyright 2014 Jay Sorg <jay.sorg@gmail.com>
+ * Copyright 2014-2015 Jay Sorg <jay.sorg@gmail.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -27,11 +27,11 @@
#include <rfxcodec_encode.h>
-static const int g_rfx_default_quantization_values[] =
+static const unsigned char g_rfx_default_quantization_values[] =
{
/* LL3 LH3 HL3 HH3 LH2 HL2 HH2 LH1 HL1 HH1 */
- 6, 6, 6, 6, 7, 7, 8, 8, 8, 9,
- 9, 9, 9, 9, 10, 10, 12, 12, 12, 13
+ 0x66, 0x66, 0x77, 0x88, 0x98,
+ 0x99, 0x99, 0xaa, 0xcc, 0xdc
};
/*****************************************************************************/
@@ -46,7 +46,7 @@ get_mstime(void)
/******************************************************************************/
static int
-speed_random(int count, const int *quants)
+speed_random(int count, const char *quants)
{
void *han;
int error;
@@ -56,34 +56,44 @@ speed_random(int count, const int *quants)
char *cdata;
char *buf;
struct rfx_rect regions[1];
- struct rfx_tile tiles[1];
+ struct rfx_tile tiles[2];
int stime;
int etime;
int tiles_per_second;
int num_regions;
int num_tiles;
int num_quants;
+ int flags;
printf("speed_random:\n");
- han = rfxcodec_encode_create(1920, 1024, RFX_FORMAT_BGRA, RFX_FLAGS_RLGR1);
- if (han == 0)
+ //flags = RFX_FLAGS_RLGR1 | RFX_FLAGS_NOACCEL;
+ flags = RFX_FLAGS_RLGR1;
+ //flags = RFX_FLAGS_RLGR3;
+ //flags = RFX_FLAGS_RLGR1 | RFX_FLAGS_ALPHAV1;
+ error = rfxcodec_encode_create_ex(1920, 1024, RFX_FORMAT_BGRA, flags, &han);
+ if (error != 0)
{
- printf("speed_random: rfxcodec_encode_create failed\n");
+ printf("speed_random: rfxcodec_encode_create_ex failed\n");
return 1;
}
- printf("speed_random: rfxcodec_encode_create ok\n");
- cdata = (char *) malloc(64 * 64 * 4);
- cdata_bytes = 64 * 64 * 4;
- buf = (char *) malloc(64 * 64 * 4);
+ printf("speed_random: rfxcodec_encode_create_ex ok\n");
+ cdata = (char *) malloc(128 * 64 * 4);
+ cdata_bytes = 128 * 64 * 4;
+ buf = (char *) malloc(128 * 64 * 4);
+#if 1
fd = open("/dev/urandom", O_RDONLY);
- if (read(fd, buf, 64 * 64 * 4) != 64 * 64 * 4)
+ //fd = open("/dev/zero", O_RDONLY);
+ if (read(fd, buf, 128 * 64 * 4) != 128 * 64 * 4)
{
printf("speed_random: read error\n");
}
close(fd);
+#else
+ memset(buf, 0x7f, 128 * 64 * 4);
+#endif
regions[0].x = 0;
regions[0].y = 0;
- regions[0].cx = 64;
+ regions[0].cx = 128;
regions[0].cy = 64;
num_regions = 1;
tiles[0].x = 0;
@@ -93,22 +103,31 @@ speed_random(int count, const int *quants)
tiles[0].quant_y = 0;
tiles[0].quant_cb = 0;
tiles[0].quant_cr = 0;
+ tiles[1].x = 64;
+ tiles[1].y = 0;
+ tiles[1].cx = 64;
+ tiles[1].cy = 64;
+ tiles[1].quant_y = 0;
+ tiles[1].quant_cb = 0;
+ tiles[1].quant_cr = 0;
num_tiles = 1;
num_quants = 1;
error = 0;
stime = get_mstime();
+ flags = 0;
+ //flags = RFX_FLAGS_ALPHAV1;
for (index = 0; index < count; index++)
{
- error = rfxcodec_encode(han, cdata, &cdata_bytes, buf, 64, 64, 64 * 4,
- regions, num_regions, tiles, num_tiles,
- quants, num_quants);
+ error = rfxcodec_encode_ex(han, cdata, &cdata_bytes, buf, 64, 64, 64 * 4,
+ regions, num_regions, tiles, num_tiles,
+ quants, num_quants, flags);
if (error != 0)
{
break;
}
}
etime = get_mstime();
- tiles_per_second = count * 1000 / (etime - stime);
+ tiles_per_second = count * num_tiles * 1000 / (etime - stime + 1);
printf("speed_random: cdata_bytes %d count %d ms time %d "
"tiles_per_second %d\n",
cdata_bytes, count, etime - stime, tiles_per_second);
@@ -221,7 +240,7 @@ load_bmp_file(int in_fd, char **data, int *width, int *height)
/******************************************************************************/
static int
encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes,
- const int *quants, int num_quants)
+ const char *quants, int num_quants)
{
int awidth;
int aheight;
@@ -235,10 +254,10 @@ encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes,
void *han;
struct rfx_rect regions[1];
- han = rfxcodec_encode_create(1920, 1024, RFX_FORMAT_BGRA, RFX_FLAGS_RLGR1);
- if (han == 0)
+ error = rfxcodec_encode_create_ex(1920, 1024, RFX_FORMAT_BGRA, RFX_FLAGS_RLGR1, &han);
+ if (error != 0)
{
- printf("encode_file: rfxcodec_encode_create failed\n");
+ printf("encode_file: rfxcodec_encode_create_ex failed\n");
return 1;
}
@@ -269,9 +288,9 @@ encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes,
regions[0].cy = height;
num_regions = 1;
- error = rfxcodec_encode(han, cdata, cdata_bytes, data, width, height, width * 4,
- regions, num_regions, tiles, num_tiles,
- quants, num_quants);
+ error = rfxcodec_encode_ex(han, cdata, cdata_bytes, data, width, height, width * 4,
+ regions, num_regions, tiles, num_tiles,
+ quants, num_quants, 0);
if (error != 0)
{
printf("encode_file: rfxcodec_encode failed error %d\n", error);
@@ -287,7 +306,7 @@ encode_file(char *data, int width, int height, char *cdata, int *cdata_bytes,
/******************************************************************************/
static int
-read_file(int count, const int *quants, int num_quants,
+read_file(int count, const char *quants, int num_quants,
const char *in_file, const char *out_file)
{
int in_fd;
@@ -380,7 +399,7 @@ main(int argc, char **argv)
int count;
char in_file[256];
char out_file[256];
- const int *quants = g_rfx_default_quantization_values;
+ const char *quants = (const char *) g_rfx_default_quantization_values;
do_speed = 0;
do_read = 0;