Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authorHenrik Gramner <gramner@twoorioles.com>2022-09-15 14:51:54 +0300
committerHenrik Gramner <gramner@twoorioles.com>2022-09-19 18:09:41 +0300
commit49b1c3c54c0ddb61be1342a7783f127ea343fddf (patch)
treee4dec832e745e5285c5e4acdd561f2278992e7b8
parent0c8a3461b8253252ca400b0e2e06b4814e930006 (diff)
x86: Fix overflows in 12bpc AVX2 DC-only IDCT
Using smaller immediates also results in a small code size reduction in some cases, so apply those changes to the (10bpc-only) SSE code as well.
-rw-r--r--  src/x86/itx16_avx2.asm  298
-rw-r--r--  src/x86/itx16_sse.asm   199
2 files changed, 223 insertions, 274 deletions
diff --git a/src/x86/itx16_avx2.asm b/src/x86/itx16_avx2.asm
index fa48145..8d68ce4 100644
--- a/src/x86/itx16_avx2.asm
+++ b/src/x86/itx16_avx2.asm
@@ -287,37 +287,40 @@ ALIGN function_align
%endif
%endmacro
-%macro INV_TXFM_4X4_FN 2 ; type1, type2
- INV_TXFM_FN %1, %2, 0, 4x4
-%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
- movd xm1, [pw_2896x8]
- mov [cq], eobd ; 0
- add r6d, 2048
- sar r6d, 12
- movd xm0, r6d
- packssdw xm0, xm0
- pmulhrsw xm0, xm1
- vpbroadcastw xm0, xm0
- mova xm1, xm0
- jmp m(iadst_4x4_internal_10bpc).end
-%endif
-%endmacro
-
-%macro INV_TXFM_4X4_12BPC_FN 2 ; type1, type2
- INV_TXFM_FN %1, %2, 0, 4x4, 12
+%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x4, %3
%ifidn %1_%2, dct_dct
+ vpbroadcastd xm3, [pixel_%3bpc_max]
+%if %3 = 10
+.dconly:
imul r6d, [cq], 181
mov [cq], eobd ; 0
+ mov r3d, 4
+.dconly2:
add r6d, 128
sar r6d, 8
+.dconly3:
imul r6d, 181
- add r6d, 128
- sar r6d, 8
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
- vpbroadcastd m0, xm0
- mova m1, m0
- jmp m(iadst_4x4_internal_12bpc).end
+ vpbroadcastw xm0, xm0
+ pxor xm2, xm2
+.dconly_loop:
+ movq xm1, [dstq+strideq*0]
+ movhps xm1, [dstq+strideq*1]
+ paddw xm1, xm0
+ pmaxsw xm1, xm2
+ pminsw xm1, xm3
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ WRAP_XMM RET
+%else
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
+%endif
%endif
%endmacro
@@ -556,10 +559,10 @@ cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
movhps [r6 +strideq*1], xm1
RET
-INV_TXFM_4X4_12BPC_FN dct, dct
-INV_TXFM_4X4_12BPC_FN dct, identity
-INV_TXFM_4X4_12BPC_FN dct, adst
-INV_TXFM_4X4_12BPC_FN dct, flipadst
+INV_TXFM_4X4_FN dct, dct, 12
+INV_TXFM_4X4_FN dct, identity, 12
+INV_TXFM_4X4_FN dct, adst, 12
+INV_TXFM_4X4_FN dct, flipadst, 12
cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
call m(idct_4x4_internal_10bpc).main
@@ -576,10 +579,10 @@ cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
vpermq m1, m1, q2031
jmp m(iadst_4x4_internal_12bpc).end
-INV_TXFM_4X4_12BPC_FN adst, dct
-INV_TXFM_4X4_12BPC_FN adst, adst
-INV_TXFM_4X4_12BPC_FN adst, flipadst
-INV_TXFM_4X4_12BPC_FN adst, identity
+INV_TXFM_4X4_FN adst, dct, 12
+INV_TXFM_4X4_FN adst, adst, 12
+INV_TXFM_4X4_FN adst, flipadst, 12
+INV_TXFM_4X4_FN adst, identity, 12
cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
@@ -628,10 +631,10 @@ cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
movhps [r6 +strideq*1], xm1
RET
-INV_TXFM_4X4_12BPC_FN flipadst, dct
-INV_TXFM_4X4_12BPC_FN flipadst, adst
-INV_TXFM_4X4_12BPC_FN flipadst, flipadst
-INV_TXFM_4X4_12BPC_FN flipadst, identity
+INV_TXFM_4X4_FN flipadst, dct, 12
+INV_TXFM_4X4_FN flipadst, adst, 12
+INV_TXFM_4X4_FN flipadst, flipadst, 12
+INV_TXFM_4X4_FN flipadst, identity, 12
cglobal iflipadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
@@ -652,10 +655,10 @@ cglobal iflipadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
mova m1, m2
jmp m(iadst_4x4_internal_12bpc).end
-INV_TXFM_4X4_12BPC_FN identity, dct
-INV_TXFM_4X4_12BPC_FN identity, adst
-INV_TXFM_4X4_12BPC_FN identity, flipadst
-INV_TXFM_4X4_12BPC_FN identity, identity
+INV_TXFM_4X4_FN identity, dct, 12
+INV_TXFM_4X4_FN identity, adst, 12
+INV_TXFM_4X4_FN identity, flipadst, 12
+INV_TXFM_4X4_FN identity, identity, 12
cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
mova m1, [itx4_shuf]
@@ -690,33 +693,13 @@ cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
vpbroadcastd xm3, [pixel_%3bpc_max]
%if %3 = 10
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 2048
- sar r6d, 12
-.end:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
- movd xm0, r6d
- vpbroadcastw xm0, xm0
- pxor xm2, xm2
-.end_loop:
- movq xm1, [dstq+strideq*0]
- movhps xm1, [dstq+strideq*1]
- paddw xm1, xm0
- pmaxsw xm1, xm2
- pminsw xm1, xm3
- movq [dstq+strideq*0], xm1
- movhps [dstq+strideq*1], xm1
- lea dstq, [dstq+strideq*2]
- sub r3d, 2
- jg .end_loop
- WRAP_XMM RET
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
%else
jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
%endif
@@ -1188,12 +1171,12 @@ cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 4x16, %3
%ifidn %1_%2, dct_dct
vpbroadcastd xm3, [pixel_%3bpc_max]
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r6d, 6144
- sar r6d, 13
- jmp m(inv_txfm_add_dct_dct_4x8_10bpc).end
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
%endif
%endmacro
@@ -1834,39 +1817,18 @@ cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 8x4, %3
%ifidn %1_%2, dct_dct
- vpbroadcastd m4, [pixel_%3bpc_max]
+ vpbroadcastd m3, [pixel_%3bpc_max]
%if %3 = 10
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
- movd xm0, r6d
- vpbroadcastw m0, xm0
-.end:
- pxor m3, m3
- mova xm1, [dstq+strideq*0]
- vinserti128 m1, [dstq+strideq*1], 1
- lea r6, [dstq+strideq*2]
- mova xm2, [r6 +strideq*0]
- vinserti128 m2, [r6 +strideq*1], 1
- paddw m1, m0
- paddw m2, m0
- pmaxsw m1, m3
- pmaxsw m2, m3
- pminsw m1, m4
- pminsw m2, m4
- mova [dstq+strideq*0], xm1
- vextracti128 [dstq+strideq*1], m1, 1
- mova [r6 +strideq*0], xm2
- vextracti128 [r6 +strideq*1], m2, 1
- RET
+ mov r3d, 4
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
%else
jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
%endif
@@ -2244,16 +2206,16 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
vpbroadcastd m3, [pixel_%3bpc_max]
%if %3 = 10
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
.dconly2:
- add r6d, 6144
- sar r6d, 13
+ add r6d, 384
+ sar r6d, 9
.dconly3:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
vpbroadcastw m0, xm0
pxor m2, m2
@@ -2779,12 +2741,12 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 8x16, %4
%ifidn %1_%2, dct_dct
vpbroadcastd m3, [pixel_%4bpc_max]
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
%endif
%endmacro
@@ -3426,16 +3388,16 @@ ALIGN function_align
vpbroadcastd m4, [pixel_%3bpc_max]
%if %3 = 10
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 4
.dconly2:
- add r6d, 6144
- sar r6d, 13
+ add r6d, 384
+ sar r6d, 9
.dconly3:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
vpbroadcastw m0, xm0
pxor m3, m3
@@ -3946,12 +3908,12 @@ cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 16x8, %3
%ifidn %1_%2, dct_dct
vpbroadcastd m4, [pixel_%3bpc_max]
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
%endif
%endmacro
@@ -4590,11 +4552,11 @@ cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 16x16, %4
%ifidn %1_%2, dct_dct
vpbroadcastd m4, [pixel_%4bpc_max]
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r6d, 10240
- sar r6d, 14
+ add r6d, 640
+ sar r6d, 10
jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
%endif
%endmacro
@@ -5748,11 +5710,11 @@ cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
RET
.dconly:
vpbroadcastd m3, [pixel_10bpc_max]
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r6d, 10240
- sar r6d, 14
+ add r6d, 640
+ sar r6d, 10
jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
ALIGN function_align
.pass1_main_part1:
@@ -6294,11 +6256,11 @@ cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob
RET
.dconly:
vpbroadcastd m3, [pixel_12bpc_max]
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r6d, 10240
- sar r6d, 14
+ add r6d, 640
+ sar r6d, 10
jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
ALIGN function_align
.pass1_main:
@@ -6363,17 +6325,17 @@ cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jnz .full
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
vpbroadcastd m4, [pixel_10bpc_max]
.dconly:
- add r6d, 10240
- sar r6d, 14
+ add r6d, 640
+ sar r6d, 10
.dconly2:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
vpbroadcastw m0, xm0
pxor m3, m3
@@ -6499,7 +6461,7 @@ cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jnz .full
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
vpbroadcastd m4, [pixel_12bpc_max]
@@ -6581,12 +6543,12 @@ cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
jmp .fast
.dconly:
vpbroadcastd m4, [pixel_10bpc_max]
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
.eob44:
mova [r4+16*0], xm0
@@ -6932,14 +6894,14 @@ cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
jmp .end
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 6144
- sar r6d, 13
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
vpbroadcastd m4, [pixel_10bpc_max]
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
.full:
@@ -7203,7 +7165,7 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
vpbroadcastd m4, [pixel_10bpc_max]
@@ -7482,11 +7444,11 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
jmp .pass2
.dconly:
vpbroadcastd m4, [pixel_10bpc_max]
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
- add r6d, 10240
- sar r6d, 14
+ add r6d, 640
+ sar r6d, 10
jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
.fast:
lea r4, [rsp+32*38]
@@ -7821,14 +7783,14 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 6144
- sar r6d, 13
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
vpbroadcastd m4, [pixel_10bpc_max]
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
.fast:
@@ -8004,16 +7966,16 @@ ALIGN function_align
cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jnz .normal
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
.dconly:
- add r6d, 10240
- sar r6d, 14
+ add r6d, 640
+ sar r6d, 10
.dconly2:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
%if WIN64
movaps [rsp+8], xmm6
@@ -8278,14 +8240,14 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 6144
- sar r6d, 13
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
.fast:
pxor m0, m0
@@ -8427,7 +8389,7 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm
index ee5bfa5..3833e17 100644
--- a/src/x86/itx16_sse.asm
+++ b/src/x86/itx16_sse.asm
@@ -361,18 +361,32 @@ ALIGN function_align
%macro INV_TXFM_4X4_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 4x4
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
- movd m1, [o(pw_2896x8)]
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
+ mov r3d, 4
+.dconly:
+ add r5d, 128
+ sar r5d, 8
+.dconly2:
+ imul r5d, 2896
+ mova m2, [o(pixel_10bpc_max)]
+ add r5d, 34816
movd m0, r5d
- packssdw m0, m0
- pmulhrsw m0, m1
- pshuflw m0, m0, q0000
+ pshuflw m0, m0, q1111
+ pxor m3, m3
punpcklqdq m0, m0
- mova m1, m0
- TAIL_CALL m(iadst_4x4_internal_16bpc).end
+.dconly_loop:
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ paddw m1, m0
+ pminsw m1, m2
+ pmaxsw m1, m3
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
%endif
%endmacro
@@ -662,40 +676,13 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, %3, 4x8
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 2
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 2048
- sar r5d, 12
-.end:
- imul r5d, 2896
- add r5d, 34816
- movd m0, r5d
- pshuflw m0, m0, q1111
- punpcklqdq m0, m0
- pxor m4, m4
- mova m3, [o(pixel_10bpc_max)]
- lea r2, [strideq*3]
-.loop:
- movq m1, [dstq+strideq*0]
- movq m2, [dstq+strideq*2]
- movhps m1, [dstq+strideq*1]
- movhps m2, [dstq+r2]
- paddw m1, m0
- paddw m2, m0
- REPX {pminsw x, m3}, m1, m2
- REPX {pmaxsw x, m4}, m1, m2
- movq [dstq+strideq*0], m1
- movhps [dstq+strideq*1], m1
- movq [dstq+strideq*2], m2
- movhps [dstq+r2 ], m2
- lea dstq, [dstq+strideq*4]
- dec r3d
- jg .loop
- RET
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
%endif
%endmacro
@@ -944,12 +931,12 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 4
- add r5d, 6144
- sar r5d, 13
- jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end
+ mov r3d, 16
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
%endif
%endmacro
@@ -1297,13 +1284,13 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 2048
- sar r5d, 12
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 128
+ sar r5d, 8
imul r5d, 2896
add r5d, 34816
movd m0, r5d
@@ -1783,12 +1770,12 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 2
.end:
- add r5d, 6144
- sar r5d, 13
+ add r5d, 384
+ sar r5d, 9
.end2:
imul r5d, 2896
add r5d, 34816
@@ -2146,11 +2133,11 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
mov r3d, 4
%if stack_size_padded > 0
; adjust to caller's stack allocation
@@ -2477,12 +2464,12 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 4
.dconly:
- add r5d, 6144
- sar r5d, 13
+ add r5d, 384
+ sar r5d, 9
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -3482,12 +3469,12 @@ cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
%if ARCH_X86_32
add rsp, 1*16
%endif
@@ -3949,11 +3936,11 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (5+ARCH_X86_64*3+WIN64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
%endif
@@ -5182,11 +5169,11 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (31+2*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
@@ -5359,12 +5346,12 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
%endif
RET
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
add rsp, (65+4*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
@@ -6069,12 +6056,12 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
.dconly1:
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -6368,14 +6355,14 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
@@ -6589,7 +6576,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
@@ -6862,11 +6849,11 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
@@ -7122,14 +7109,14 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
@@ -7660,12 +7647,12 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
.dconly1:
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -7904,14 +7891,14 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
add rsp, (1+8*32+1*WIN64)*16
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
@@ -8140,7 +8127,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \