Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/neutrinolabs/librfxcodec.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jay Sorg <jay.sorg@gmail.com> 2017-09-15 03:18:55 +0300
committer: Jay Sorg <jay.sorg@gmail.com> 2017-09-15 03:18:55 +0300
commit: 29600d32b56d9f961a8d7ac3f6972206575d6f7d (patch)
tree: f93593fb4bab9ffda0fd5efdb65af4cc976688b6
parent: 0529c86405ac959a0f1a0c1464abc1509b67d589 (diff)
parent: acd9f3bdaa779598603448921f04af6921e74587 (diff)
Merge branch 'mirabilos-asm-elf-pie' into devel
-rw-r--r-- src/amd64/Makefile.am 2
-rw-r--r-- src/amd64/cpuid_amd64.asm 5
-rw-r--r-- src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm 156
-rw-r--r-- src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm 156
-rw-r--r-- src/common.asm 75
-rw-r--r-- src/x86/Makefile.am 2
-rw-r--r-- src/x86/cpuid_x86.asm 7
-rw-r--r-- src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm 182
-rw-r--r-- src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm 182
9 files changed, 414 insertions, 353 deletions
diff --git a/src/amd64/Makefile.am b/src/amd64/Makefile.am
index 07aae69..2870ab5 100644
--- a/src/amd64/Makefile.am
+++ b/src/amd64/Makefile.am
@@ -1,3 +1,5 @@
+NAFLAGS += -DASM_ARCH_AMD64
+
AMD64_ASM = \
cpuid_amd64.asm \
rfxcodec_encode_dwt_shift_amd64_sse2.asm \
diff --git a/src/amd64/cpuid_amd64.asm b/src/amd64/cpuid_amd64.asm
index acc738e..38e2023 100644
--- a/src/amd64/cpuid_amd64.asm
+++ b/src/amd64/cpuid_amd64.asm
@@ -1,7 +1,5 @@
%include "common.asm"
-section .text
-
;The first six integer or pointer arguments are passed in registers
;RDI, RSI, RDX, RCX, R8, and R9
@@ -32,5 +30,4 @@ PROC cpuid_amd64
; restore registers
pop rbx
ret
- align 16
-
+END_OF_FILE
diff --git a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm
index 69ccf07..cef3902 100644
--- a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm
+++ b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse2.asm
@@ -21,8 +21,7 @@
%include "common.asm"
-section .data
- align 16
+PREPARE_RODATA
cw128 times 8 dw 128
cdFFFF times 4 dd 65535
; these are 1 << (factor - 1) 0 to 15 is factor
@@ -43,8 +42,6 @@ section .data
cwa8192 times 8 dw 8192 ; 14
cwa16384 times 8 dw 16384 ; 15
-section .text
-
;******************************************************************************
; source 16 bit signed, 16 pixel width
rfx_dwt_2d_encode_block_horiz_16_16:
@@ -55,8 +52,8 @@ loop1a:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -66,8 +63,8 @@ loop1a:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -85,8 +82,8 @@ loop1a:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -247,8 +244,8 @@ loop1c:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -258,8 +255,8 @@ loop1c:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -277,8 +274,8 @@ loop1c:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -328,8 +325,8 @@ loop1c:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -339,8 +336,8 @@ loop1c:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -358,8 +355,8 @@ loop1c:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -423,8 +420,8 @@ loop1c1:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -434,8 +431,8 @@ loop1c1:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -453,8 +450,8 @@ loop1c1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -501,8 +498,8 @@ loop1c1:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -512,8 +509,8 @@ loop1c1:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -531,8 +528,8 @@ loop1c1:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -690,8 +687,8 @@ loop1e:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -701,8 +698,8 @@ loop1e:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -720,8 +717,8 @@ loop1e:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -774,8 +771,8 @@ loop2e:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -785,8 +782,8 @@ loop2e:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -804,8 +801,8 @@ loop2e:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -857,8 +854,8 @@ loop2e:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -868,8 +865,8 @@ loop2e:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -887,8 +884,8 @@ loop2e:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -952,8 +949,8 @@ loop1e1:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -963,8 +960,8 @@ loop1e1:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -982,8 +979,8 @@ loop1e1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -1033,8 +1030,8 @@ loop2e1:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -1044,8 +1041,8 @@ loop2e1:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -1063,8 +1060,8 @@ loop2e1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -1113,8 +1110,8 @@ loop2e1:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -1124,8 +1121,8 @@ loop2e1:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -1143,8 +1140,8 @@ loop2e1:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -1207,9 +1204,9 @@ loop1f:
punpcklbw xmm1, xmm0
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
- psubw xmm1, [rel cw128]
- psubw xmm2, [rel cw128]
- psubw xmm3, [rel cw128]
+ psubw xmm1, [lsym(cw128)]
+ psubw xmm2, [lsym(cw128)]
+ psubw xmm3, [lsym(cw128)]
psllw xmm1, 5
psllw xmm2, 5
psllw xmm3, 5
@@ -1241,8 +1238,8 @@ loop2f:
movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
- psubw xmm2, [rel cw128]
- psubw xmm3, [rel cw128]
+ psubw xmm2, [lsym(cw128)]
+ psubw xmm3, [lsym(cw128)]
psllw xmm2, 5
psllw xmm3, 5
movdqa xmm4, xmm1
@@ -1274,7 +1271,7 @@ loop2f:
movdqa xmm1, xmm3 ; src[2n]
movq xmm2, [rsi + 64 * 1] ; src[2n + 1]
punpcklbw xmm2, xmm0
- psubw xmm2, [rel cw128]
+ psubw xmm2, [lsym(cw128)]
psllw xmm2, 5
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -1314,7 +1311,7 @@ set_quants_hi:
sub rax, 6 - 5
movd xmm9, eax
imul rax, 16
- lea rdx, [rel cwa0]
+ lea rdx, [lsym(cwa0)]
add rdx, rax
movdqa xmm8, [rdx]
ret
@@ -1323,7 +1320,7 @@ set_quants_lo:
sub rax, 6 - 5
movd xmm11, eax
imul rax, 16
- lea rdx, [rel cwa0]
+ lea rdx, [lsym(cwa0)]
add rdx, rax
movdqa xmm10, [rdx]
ret
@@ -1487,5 +1484,4 @@ PROC rfxcodec_encode_dwt_shift_amd64_sse2
pop rdx
pop rbx
ret
- align 16
-
+END_OF_FILE
diff --git a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm
index 2b19f81..da176e7 100644
--- a/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm
+++ b/src/amd64/rfxcodec_encode_dwt_shift_amd64_sse41.asm
@@ -21,8 +21,7 @@
%include "common.asm"
-section .data
- align 16
+PREPARE_RODATA
cw128 times 8 dw 128
cdFFFF times 4 dd 65535
; these are 1 << (factor - 1) 0 to 15 is factor
@@ -43,8 +42,6 @@ section .data
cwa8192 times 8 dw 8192 ; 14
cwa16384 times 8 dw 16384 ; 15
-section .text
-
;******************************************************************************
; source 16 bit signed, 16 pixel width
rfx_dwt_2d_encode_block_horiz_16_16:
@@ -55,15 +52,15 @@ loop1a:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -77,8 +74,8 @@ loop1a:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -235,15 +232,15 @@ loop1c:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -257,8 +254,8 @@ loop1c:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -304,15 +301,15 @@ loop1c:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -326,8 +323,8 @@ loop1c:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -387,15 +384,15 @@ loop1c1:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -409,8 +406,8 @@ loop1c1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -453,15 +450,15 @@ loop1c1:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -475,8 +472,8 @@ loop1c1:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -630,15 +627,15 @@ loop1e:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -652,8 +649,8 @@ loop1e:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -702,15 +699,15 @@ loop2e:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -724,8 +721,8 @@ loop2e:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -773,15 +770,15 @@ loop2e:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -795,8 +792,8 @@ loop2e:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -856,15 +853,15 @@ loop1e1:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -878,8 +875,8 @@ loop1e1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -925,15 +922,15 @@ loop2e1:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -947,8 +944,8 @@ loop2e1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -993,15 +990,15 @@ loop2e1:
movdqa xmm2, [rsi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [rel cdFFFF]
- pand xmm2, [rel cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [rel cdFFFF]
- pand xmm3, [rel cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -1015,8 +1012,8 @@ loop2e1:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [rel cdFFFF]
- pand xmm4, [rel cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -1075,9 +1072,9 @@ loop1f:
punpcklbw xmm1, xmm0
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
- psubw xmm1, [rel cw128]
- psubw xmm2, [rel cw128]
- psubw xmm3, [rel cw128]
+ psubw xmm1, [lsym(cw128)]
+ psubw xmm2, [lsym(cw128)]
+ psubw xmm3, [lsym(cw128)]
psllw xmm1, 5
psllw xmm2, 5
psllw xmm3, 5
@@ -1109,8 +1106,8 @@ loop2f:
movq xmm3, [rsi + 64 * 1 * 2] ; src[2n + 2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
- psubw xmm2, [rel cw128]
- psubw xmm3, [rel cw128]
+ psubw xmm2, [lsym(cw128)]
+ psubw xmm3, [lsym(cw128)]
psllw xmm2, 5
psllw xmm3, 5
movdqa xmm4, xmm1
@@ -1142,7 +1139,7 @@ loop2f:
movdqa xmm1, xmm3 ; src[2n]
movq xmm2, [rsi + 64 * 1] ; src[2n + 1]
punpcklbw xmm2, xmm0
- psubw xmm2, [rel cw128]
+ psubw xmm2, [lsym(cw128)]
psllw xmm2, 5
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -1182,7 +1179,7 @@ set_quants_hi:
sub rax, 6 - 5
movd xmm9, eax
imul rax, 16
- lea rdx, [rel cwa0]
+ lea rdx, [lsym(cwa0)]
add rdx, rax
movdqa xmm8, [rdx]
ret
@@ -1191,7 +1188,7 @@ set_quants_lo:
sub rax, 6 - 5
movd xmm11, eax
imul rax, 16
- lea rdx, [rel cwa0]
+ lea rdx, [lsym(cwa0)]
add rdx, rax
movdqa xmm10, [rdx]
ret
@@ -1355,5 +1352,4 @@ PROC rfxcodec_encode_dwt_shift_amd64_sse41
pop rdx
pop rbx
ret
- align 16
-
+END_OF_FILE
diff --git a/src/common.asm b/src/common.asm
index 7e2b84b..cf7102d 100644
--- a/src/common.asm
+++ b/src/common.asm
@@ -1,5 +1,6 @@
;
;Copyright 2017 Pavel Roskin
+;Copyright 2017 mirabilos
;
;Permission to use, copy, modify, distribute, and sell this software and its
;documentation for any purpose is hereby granted without fee, provided that
@@ -33,6 +34,19 @@
%define is_elf 1
%endif
+; Detect Mach-O formats
+%ifidn __OUTPUT_FORMAT__,macho
+%define is_macho 1
+%endif
+
+%ifidn __OUTPUT_FORMAT__,macho32
+%define is_macho 1
+%endif
+
+%ifidn __OUTPUT_FORMAT__,macho64
+%define is_macho 1
+%endif
+
; Mark stack non-executable
%ifdef is_elf
section .note.GNU-stack noalloc noexec nowrite progbits
@@ -49,3 +63,64 @@ section .note.GNU-stack noalloc noexec nowrite progbits
_%1:
%endif
%endmacro
+
+; Macros for relative access to local data
+%undef lsym
+
+%ifdef ASM_ARCH_AMD64
+; amd64; don't define or call RETRIEVE_RODATA
+%define lsym(name) rel name
+%endif
+
+%ifdef ASM_ARCH_I386
+%ifdef PIC
+; i386 PIC
+
+%macro END_OF_FILE 0
+%ifdef I386_PIC_NEEDED
+section .text
+..@get_caller_address:
+ mov ebx, [esp]
+ ret
+%endif
+%ifdef is_macho
+; see below
+ align 16
+%endif
+%endmacro
+
+%macro RETRIEVE_RODATA 0
+%define I386_PIC_NEEDED 1
+ call ..@get_caller_address
+%%the_caller_address:
+ sub ebx, %%the_caller_address - ..@rodata_begin
+%endmacro
+
+%define lsym(name) ebx + name - ..@rodata_begin
+%else
+; i386 non-PIC; default case for lsym and RETRIEVE_RODATA
+%endif
+%endif
+
+%ifndef lsym
+%macro RETRIEVE_RODATA 0
+%endmacro
+%define lsym(name) name
+%endif
+
+%macro PREPARE_RODATA 0
+section .text
+ align 16
+..@rodata_begin:
+%endmacro
+
+%ifnmacro END_OF_FILE 0
+%macro END_OF_FILE 0
+%ifdef is_macho
+; cf. https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/simd/jccolext-mmx.asm#L474-L476
+ align 16
+%endif
+%endmacro
+%endif
+
+section .text
diff --git a/src/x86/Makefile.am b/src/x86/Makefile.am
index 2d099e5..3c88cee 100644
--- a/src/x86/Makefile.am
+++ b/src/x86/Makefile.am
@@ -1,3 +1,5 @@
+NAFLAGS += -DASM_ARCH_I386
+
X86_ASM = \
cpuid_x86.asm \
rfxcodec_encode_dwt_shift_x86_sse2.asm \
diff --git a/src/x86/cpuid_x86.asm b/src/x86/cpuid_x86.asm
index 4ddb8a2..b666732 100644
--- a/src/x86/cpuid_x86.asm
+++ b/src/x86/cpuid_x86.asm
@@ -1,7 +1,5 @@
%include "common.asm"
-section .text
-
;int
;cpuid_x86(int eax_in, int ecx_in, int *eax, int *ebx, int *ecx, int *edx)
@@ -29,6 +27,5 @@ PROC cpuid_x86
pop edx
pop ecx
pop ebx
- ret;
- align 16
-
+ ret
+END_OF_FILE
diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
index fdfbae1..f05a705 100644
--- a/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
+++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse2.asm
@@ -1,5 +1,6 @@
;
;Copyright 2016 Jay Sorg
+;Copyright 2017 mirabilos
;
;Permission to use, copy, modify, distribute, and sell this software and its
;documentation for any purpose is hereby granted without fee, provided that
@@ -21,8 +22,7 @@
%include "common.asm"
-section .data
- align 16
+PREPARE_RODATA
cw128 times 8 dw 128
cdFFFF times 4 dd 65535
; these are 1 << (factor - 1) 0 to 15 is factor
@@ -43,8 +43,6 @@ section .data
cwa8192 times 8 dw 8192 ; 14
cwa16384 times 8 dw 16384 ; 15
-section .text
-
%define LHI_ADD [esp + 1 * 16 + 4]
%define LHI_SFT [esp + 2 * 16 + 4]
%define LLO_ADD [esp + 3 * 16 + 4]
@@ -60,8 +58,8 @@ loop1a:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -71,8 +69,8 @@ loop1a:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -90,8 +88,8 @@ loop1a:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -252,8 +250,8 @@ loop1c:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -263,8 +261,8 @@ loop1c:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -282,8 +280,8 @@ loop1c:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -316,7 +314,7 @@ loop1c:
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa xmm6, xmm5 ; out lo
paddw xmm6, LLO_ADD
@@ -333,8 +331,8 @@ loop1c:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -344,8 +342,8 @@ loop1c:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -363,8 +361,8 @@ loop1c:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -387,7 +385,7 @@ loop1c:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
@@ -428,8 +426,8 @@ loop1c1:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -439,8 +437,8 @@ loop1c1:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -458,8 +456,8 @@ loop1c1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -492,7 +490,7 @@ loop1c1:
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa [edx], xmm5 ; out lo
@@ -506,8 +504,8 @@ loop1c1:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -517,8 +515,8 @@ loop1c1:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -536,8 +534,8 @@ loop1c1:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -560,7 +558,7 @@ loop1c1:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
@@ -695,8 +693,8 @@ loop1e:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -706,8 +704,8 @@ loop1e:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -725,8 +723,8 @@ loop1e:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -759,7 +757,7 @@ loop1e:
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa xmm6, xmm5 ; out lo
paddw xmm6, LLO_ADD
@@ -779,8 +777,8 @@ loop2e:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -790,8 +788,8 @@ loop2e:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -809,8 +807,8 @@ loop2e:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -834,14 +832,14 @@ loop2e:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa xmm6, xmm5 ; out lo
paddw xmm6, LLO_ADD
@@ -862,8 +860,8 @@ loop2e:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -873,8 +871,8 @@ loop2e:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -892,8 +890,8 @@ loop2e:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -916,7 +914,7 @@ loop2e:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
@@ -957,8 +955,8 @@ loop1e1:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -968,8 +966,8 @@ loop1e1:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -987,8 +985,8 @@ loop1e1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -1021,7 +1019,7 @@ loop1e1:
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa [edx], xmm5 ; out lo
@@ -1038,8 +1036,8 @@ loop2e1:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -1049,8 +1047,8 @@ loop2e1:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -1068,8 +1066,8 @@ loop2e1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -1093,14 +1091,14 @@ loop2e1:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa [edx], xmm5 ; out lo
@@ -1118,8 +1116,8 @@ loop2e1:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
pslld xmm1, 16
pslld xmm2, 16
psrad xmm1, 16
@@ -1129,8 +1127,8 @@ loop2e1:
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
pslld xmm2, 16
pslld xmm3, 16
psrad xmm2, 16
@@ -1148,8 +1146,8 @@ loop2e1:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
pslld xmm3, 16
pslld xmm4, 16
psrad xmm3, 16
@@ -1172,7 +1170,7 @@ loop2e1:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
@@ -1212,9 +1210,9 @@ loop1f:
punpcklbw xmm1, xmm0
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
- psubw xmm1, [cw128]
- psubw xmm2, [cw128]
- psubw xmm3, [cw128]
+ psubw xmm1, [lsym(cw128)]
+ psubw xmm2, [lsym(cw128)]
+ psubw xmm3, [lsym(cw128)]
psllw xmm1, 5
psllw xmm2, 5
psllw xmm3, 5
@@ -1246,8 +1244,8 @@ loop2f:
movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
- psubw xmm2, [cw128]
- psubw xmm3, [cw128]
+ psubw xmm2, [lsym(cw128)]
+ psubw xmm3, [lsym(cw128)]
psllw xmm2, 5
psllw xmm3, 5
movdqa xmm4, xmm1
@@ -1279,7 +1277,7 @@ loop2f:
movdqa xmm1, xmm3 ; src[2n]
movq xmm2, [esi + 64 * 1] ; src[2n + 1]
punpcklbw xmm2, xmm0
- psubw xmm2, [cw128]
+ psubw xmm2, [lsym(cw128)]
psllw xmm2, 5
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -1320,7 +1318,7 @@ set_quants_hi:
movd xmm1, eax
movdqa LHI_SFT, xmm1
imul eax, 16
- lea edx, [cwa0]
+ lea edx, [lsym(cwa0)]
add edx, eax
movdqa xmm1, [edx]
movdqa LHI_ADD, xmm1
@@ -1331,7 +1329,7 @@ set_quants_lo:
movd xmm1, eax
movdqa LLO_SFT, xmm1
imul eax, 16
- lea edx, [cwa0]
+ lea edx, [lsym(cwa0)]
add edx, eax
movdqa xmm1, [edx]
movdqa LLO_ADD, xmm1
@@ -1363,6 +1361,7 @@ PROC rfxcodec_encode_dwt_shift_x86_sse2
movdqu [esp], xmm0
; save registers
push ebx
+ RETRIEVE_RODATA
push esi
push edi
push ebp
@@ -1517,5 +1516,4 @@ PROC rfxcodec_encode_dwt_shift_x86_sse2
; return value
mov eax, 0
ret
- align 16
-
+END_OF_FILE
diff --git a/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
index 501a0bc..00d4b1d 100644
--- a/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
+++ b/src/x86/rfxcodec_encode_dwt_shift_x86_sse41.asm
@@ -1,5 +1,6 @@
;
;Copyright 2016 Jay Sorg
+;Copyright 2017 mirabilos
;
;Permission to use, copy, modify, distribute, and sell this software and its
;documentation for any purpose is hereby granted without fee, provided that
@@ -21,8 +22,7 @@
%include "common.asm"
-section .data
- align 16
+PREPARE_RODATA
cw128 times 8 dw 128
cdFFFF times 4 dd 65535
; these are 1 << (factor - 1) 0 to 15 is factor
@@ -43,8 +43,6 @@ section .data
cwa8192 times 8 dw 8192 ; 14
cwa16384 times 8 dw 16384 ; 15
-section .text
-
%define LHI_ADD [esp + 1 * 16 + 4]
%define LHI_SFT [esp + 2 * 16 + 4]
%define LLO_ADD [esp + 3 * 16 + 4]
@@ -60,15 +58,15 @@ loop1a:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -82,8 +80,8 @@ loop1a:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -240,15 +238,15 @@ loop1c:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -262,8 +260,8 @@ loop1c:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -292,7 +290,7 @@ loop1c:
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa xmm6, xmm5 ; out lo
paddw xmm6, LLO_ADD
@@ -309,15 +307,15 @@ loop1c:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -331,8 +329,8 @@ loop1c:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -351,7 +349,7 @@ loop1c:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
@@ -392,15 +390,15 @@ loop1c1:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -414,8 +412,8 @@ loop1c1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -444,7 +442,7 @@ loop1c1:
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa [edx], xmm5 ; out lo
@@ -458,15 +456,15 @@ loop1c1:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -480,8 +478,8 @@ loop1c1:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -500,7 +498,7 @@ loop1c1:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
@@ -635,15 +633,15 @@ loop1e:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -657,8 +655,8 @@ loop1e:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -687,7 +685,7 @@ loop1e:
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa xmm6, xmm5 ; out lo
paddw xmm6, LLO_ADD
@@ -707,15 +705,15 @@ loop2e:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -729,8 +727,8 @@ loop2e:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -750,14 +748,14 @@ loop2e:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa xmm6, xmm5 ; out lo
paddw xmm6, LLO_ADD
@@ -778,15 +776,15 @@ loop2e:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -800,8 +798,8 @@ loop2e:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -820,7 +818,7 @@ loop2e:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
@@ -861,15 +859,15 @@ loop1e1:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -883,8 +881,8 @@ loop1e1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -913,7 +911,7 @@ loop1e1:
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa [edx], xmm5 ; out lo
@@ -930,15 +928,15 @@ loop2e1:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -952,8 +950,8 @@ loop2e1:
movd xmm5, eax
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -973,14 +971,14 @@ loop2e1:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
paddw xmm5, xmm1
psrldq xmm2, 14
- movd ebx, xmm2 ; save hi
+ movd ebp, xmm2 ; save hi
movdqa [edx], xmm5 ; out lo
@@ -998,15 +996,15 @@ loop2e1:
movdqa xmm2, [esi + 16]
movdqa xmm6, xmm1
movdqa xmm7, xmm2
- pand xmm1, [cdFFFF]
- pand xmm2, [cdFFFF]
+ pand xmm1, [lsym(cdFFFF)]
+ pand xmm2, [lsym(cdFFFF)]
packusdw xmm1, xmm2
movdqa xmm2, xmm6 ; src[2n + 1]
movdqa xmm3, xmm7
psrldq xmm2, 2
psrldq xmm3, 2
- pand xmm2, [cdFFFF]
- pand xmm3, [cdFFFF]
+ pand xmm2, [lsym(cdFFFF)]
+ pand xmm3, [lsym(cdFFFF)]
packusdw xmm2, xmm3
movdqa xmm3, xmm6 ; src[2n + 2]
movdqa xmm4, xmm7
@@ -1020,8 +1018,8 @@ loop2e1:
psrldq xmm5, 12
pslldq xmm5, 12
por xmm4, xmm5
- pand xmm3, [cdFFFF]
- pand xmm4, [cdFFFF]
+ pand xmm3, [lsym(cdFFFF)]
+ pand xmm4, [lsym(cdFFFF)]
packusdw xmm3, xmm4
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -1040,7 +1038,7 @@ loop2e1:
; l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1)
movdqa xmm7, xmm5
pslldq xmm7, 2
- movd xmm6, ebx
+ movd xmm6, ebp
por xmm7, xmm6
paddw xmm5, xmm7
psraw xmm5, 1
@@ -1080,9 +1078,9 @@ loop1f:
punpcklbw xmm1, xmm0
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
- psubw xmm1, [cw128]
- psubw xmm2, [cw128]
- psubw xmm3, [cw128]
+ psubw xmm1, [lsym(cw128)]
+ psubw xmm2, [lsym(cw128)]
+ psubw xmm3, [lsym(cw128)]
psllw xmm1, 5
psllw xmm2, 5
psllw xmm3, 5
@@ -1114,8 +1112,8 @@ loop2f:
movq xmm3, [esi + 64 * 1 * 2] ; src[2n + 2]
punpcklbw xmm2, xmm0
punpcklbw xmm3, xmm0
- psubw xmm2, [cw128]
- psubw xmm3, [cw128]
+ psubw xmm2, [lsym(cw128)]
+ psubw xmm3, [lsym(cw128)]
psllw xmm2, 5
psllw xmm3, 5
movdqa xmm4, xmm1
@@ -1147,7 +1145,7 @@ loop2f:
movdqa xmm1, xmm3 ; src[2n]
movq xmm2, [esi + 64 * 1] ; src[2n + 1]
punpcklbw xmm2, xmm0
- psubw xmm2, [cw128]
+ psubw xmm2, [lsym(cw128)]
psllw xmm2, 5
movdqa xmm4, xmm1
movdqa xmm5, xmm2
@@ -1188,7 +1186,7 @@ set_quants_hi:
movd xmm1, eax
movdqa LHI_SFT, xmm1
imul eax, 16
- lea edx, [cwa0]
+ lea edx, [lsym(cwa0)]
add edx, eax
movdqa xmm1, [edx]
movdqa LHI_ADD, xmm1
@@ -1199,7 +1197,7 @@ set_quants_lo:
movd xmm1, eax
movdqa LLO_SFT, xmm1
imul eax, 16
- lea edx, [cwa0]
+ lea edx, [lsym(cwa0)]
add edx, eax
movdqa xmm1, [edx]
movdqa LLO_ADD, xmm1
@@ -1231,6 +1229,7 @@ PROC rfxcodec_encode_dwt_shift_x86_sse41
movdqu [esp], xmm0
; save registers
push ebx
+ RETRIEVE_RODATA
push esi
push edi
push ebp
@@ -1385,5 +1384,4 @@ PROC rfxcodec_encode_dwt_shift_x86_sse41
; return value
mov eax, 0
ret
- align 16
-
+END_OF_FILE