Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ronald S. Bultje <rsbultje@gmail.com> 2021-08-10 17:22:21 +0300
committer: Ronald S. Bultje <rsbultje@gmail.com> 2021-08-12 21:47:14 +0300
commit: 5455e8250cee507b9b7e7cead767ce577bb9106a (patch)
tree: 84b23a216b617aba6667c0e59c7c00f9ffa09820
parent: 9cf9d4a61323696821809c98f4e5cdc96639cb68 (diff)
x86/itx: share pass2 loop between {16,32}x32 dct^2 functions
-rw-r--r-- src/x86/itx16_sse.asm 84
1 file changed, 35 insertions, 49 deletions
diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm
index 5a03563..161416d 100644
--- a/src/x86/itx16_sse.asm
+++ b/src/x86/itx16_sse.asm
@@ -5305,15 +5305,13 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
jge .loop_pass1
; pass=2
+ add rsp, 9*16
%if ARCH_X86_64
mov r6, dstq
-%if WIN64
- mov r7, [rsp+gprsize*1+76*16]
-%endif
%else
- mov dstq, [rsp+gprsize*1+76*16]
+ mov dstq, [rsp+gprsize*1+67*16]
%endif
- mov eobd, [rsp+gprsize*0+76*16]
+ mov eobd, [rsp+gprsize*0+67*16]
cmp eobd, 44
jl .load_veryfast
cmp eobd, 151
@@ -5328,21 +5326,38 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
; fall-through
.run:
- add rsp, 9*16
- call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
%if ARCH_X86_64
- lea dstq, [r6+16]
+ lea r2, [dstq+32]
+ mov r7, -4
%else
- mov dstq, [rsp+gprsize*1+67*16]
- add dstq, 16
+ lea r2, [rsp+67*16]
+ mov dword [r2+0*gprsize], 2
%endif
- add rsp, 32*16
+ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+.loop_pass2:
mova m0, [rsp+16* 3]
+.loop_pass2_entry:
+%if ARCH_X86_32
+ mov dstq, [r2+1*gprsize]
+%endif
call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
-%assign stack_size (stack_size-41*16)
+ add rsp, 32*16
+%if ARCH_X86_64
+ add r7, 2
+ lea dstq, [r2+r7*8]
+ jl .loop_pass2
+%if WIN64
+ mov r7, [rsp+gprsize*1+3*16]
+%endif
+%else
+ add dword [r2+1*gprsize], 16
+ dec dword [r2+0*gprsize]
+ jg .loop_pass2
+%endif
+%assign stack_size (stack_size-73*16)
%if STACK_ALIGNMENT >= 16
-%assign stack_size_padded (stack_size_padded-41*16)
-%assign stack_offset (stack_offset-41*16)
+%assign stack_size_padded (stack_size_padded-73*16)
+%assign stack_offset (stack_offset-73*16)
%else
%xdefine rstkm [rsp + stack_size]
%endif
@@ -6375,9 +6390,9 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
; remove entirely-zero iterations
%if ARCH_X86_32
- mov [rsp+5*32*16+0*gprsize], dstq
+ mov [rsp+5*32*16+1*gprsize], dstq
%elif WIN64
- mov [rsp+5*32*16+0*gprsize], r7
+ mov [rsp+5*32*16+1*gprsize], r7
%endif
%undef cmp
mov r5d, 14
@@ -6410,7 +6425,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
.end_zero_loop:
; actual first pass after skipping all-zero data
- mov [rsp+gprsize*1+5*32*16], eobd
+ mov [rsp+gprsize*0+5*32*16], eobd
.loop_pass1:
mova m0, [cq+128* 1+r5*8]
mova m1, [cq+128* 7+r5*8]
@@ -6552,7 +6567,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
jge .loop_pass1
; pass=2 code starts here
- mov eobd, [rsp+gprsize*1+5*32*16]
+ mov eobd, [rsp+gprsize*0+5*32*16]
add rsp, 29*16
cmp eobd, 36
jl .load_veryfast
@@ -6573,38 +6588,9 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
mov r7, -8
%else
lea r2, [rsp+(4*32+3)*16]
- mov dword [r2+1*gprsize], 4
+ mov dword [r2+0*gprsize], 4
%endif
- jmp .loop_pass2_entry
-.loop_pass2:
- mova m0, [rsp+16* 3]
-.loop_pass2_entry:
-%if ARCH_X86_32
- mov dstq, [r2+0*gprsize]
-%endif
- call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
- add rsp, 32*16
-%if ARCH_X86_64
- add r7, 2
- lea dstq, [r2+r7*8]
- jl .loop_pass2
-%else
- mov dstq, [r2+0*gprsize]
- add dword [r2+0*gprsize], 16
- dec dword [r2+1*gprsize]
- jg .loop_pass2
-%endif
-%assign stack_size (stack_size-(4*32+29)*16)
-%if STACK_ALIGNMENT >= 16
-%assign stack_size_padded (stack_size_padded-(4*32+29)*16)
-%assign stack_offset (stack_offset-(4*32+29)*16)
-%else
-%xdefine rstkm [rsp + stack_size]
-%endif
-%if WIN64
- mov r7, [rsp+3*16+gprsize*0]
-%endif
- RET
+ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
.dconly:
imul r5d, [cq], 2896