From 345127a79532a679de025b031bdbc264df5f17ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Fri, 9 Sep 2022 12:16:03 +0300
Subject: arm: itx: Add clipping to row_clip_min/max in the 10 bpc codepaths

This fixes conformance with the argon test samples, in particular with
these samples:
profile0_core/streams/test10100_579_8614.obu
profile0_core/streams/test10218_6914.obu

This gives a pretty notable slowdown to these transforms - some examples:

Before:                                  Cortex A53      A72      A73  Apple M1
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:        365.7    290.2    299.8       0.3
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:     1865.2   1384.1   1457.5       2.6
inv_txfm_add_64x64_dct_dct_4_10bpc_neon:    33976.3  26817.0  24864.2      40.4
After:
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:        397.7    322.2    335.1       0.4
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:     2121.9   1336.7   1664.6       2.6
inv_txfm_add_64x64_dct_dct_4_10bpc_neon:    38569.4  27622.6  28176.0      51.0

Thus, for the transforms alone, this makes them around 10-13% slower
(the Apple M1 measurements are too noisy to be conclusive here).

Measured on actual full decoding, this makes decoding of 10 bpc Chimera
around 1% slower on an Apple M1 - close to the measurement noise anyway.
---
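[Editor's note, not part of the commit itself: the clamp that all the
vmin/vmax (AArch32) and smin/smax (AArch64) pairs below implement can be
summarized by the following C sketch. The helper name clip_row_coef and
the bitdepth_max parameter are hypothetical, used only to show how the
row_clip_min/max constants in the code comments are derived.]

    #include <stdint.h>

    /* For 10 bpc, bitdepth_max = 0x3ff, so:
     *   row_clip_min = (~bitdepth_max) << 7 = 0xfffe0000 (-131072)
     *   row_clip_max = ~row_clip_min        = 0x0001ffff ( 131071) */
    static inline int32_t clip_row_coef(int32_t coef, int32_t bitdepth_max) {
        const int32_t row_clip_min = (int32_t)(~(uint32_t)bitdepth_max << 7);
        const int32_t row_clip_max = ~row_clip_min;
        if (coef < row_clip_min) return row_clip_min;
        if (coef > row_clip_max) return row_clip_max;
        return coef;
    }

[On AArch64 the two constants are materialized without a load: movi with
the "msl #16" shifting-ones modifier expands #1 to 0x0001ffff, and mvni
with the same modifier gives its complement 0xfffe0000.]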
 src/arm/32/itx16.S | 239 +++++++++++++++++++++++++++++++++++----
 src/arm/64/itx16.S | 320 +++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 470 insertions(+), 89 deletions(-)

diff --git a/src/arm/32/itx16.S b/src/arm/32/itx16.S
index db8ecff..aa6c272 100644
--- a/src/arm/32/itx16.S
+++ b/src/arm/32/itx16.S
@@ -668,12 +668,21 @@ def_fn_4x4 identity, flipadst
 
 .macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
         idct_4s_x4 \r0, \r2, \r4, \r6
+        vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+        vmin.s32 \r, \r, q5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+        vmax.s32 \r, \r, q4
+.endr
+
         vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a
-        vmul_vmla q4, \r1, \r7, d2[1], d2[0] // -> t7a
+        vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a
         vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a
         vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a
         vrshr.s32 \r1, q2, #12 // t4a
-        vrshr.s32 \r7, q4, #12 // t7a
+        vrshr.s32 \r7, q3, #12 // t7a
         vrshr.s32 \r3, q6, #12 // t5a
         vrshr.s32 \r5, q7, #12 // t6a
@@ -682,17 +691,24 @@ def_fn_4x4 identity, flipadst
         vqadd.s32 q3, \r7, \r5 // t7
         vqsub.s32 \r3, \r7, \r5 // t6a
 
-        vmul_vmls q4, \r3, \r1, d0[0], d0[0] // -> t5
+.irp r, q2, \r1, q3, \r3
+        vmin.s32 \r, \r, q5
+.endr
+.irp r, q2, \r1, q3, \r3
+        vmax.s32 \r, \r, q4
+.endr
+
+        vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5
         vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6
-        vrshr.s32 q4, q4, #12 // t5
+        vrshr.s32 q7, q7, #12 // t5
         vrshr.s32 q5, q6, #12 // t6
 
         vqsub.s32 \r7, \r0, q3 // out7
         vqadd.s32 \r0, \r0, q3 // out0
         vqadd.s32 \r1, \r2, q5 // out1
         vqsub.s32 q6, \r2, q5 // out6
-        vqadd.s32 \r2, \r4, q4 // out2
-        vqsub.s32 \r5, \r4, q4 // out5
+        vqadd.s32 \r2, \r4, q7 // out2
+        vqsub.s32 \r5, \r4, q7 // out5
         vqadd.s32 \r3, \r6, q2 // out3
         vqsub.s32 \r4, \r6, q2 // out4
         vmov \r6, q6 // out6
@@ -701,6 +717,15 @@ def_fn_4x4 identity, flipadst
 
 .macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7
         idct_2s_x4 \r0, \r2, \r4, \r6
+        vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+        vmin.s32 \r, \r, d9
+.endr
+.irp r, \r0, \r2, \r4, \r6
+        vmax.s32 \r, \r, d8
+.endr
+
         vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a
         vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a
         vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a
@@ -715,6 +740,13 @@ def_fn_4x4 identity, flipadst
         vqadd.s32 d5, \r7, \r5 // t7
         vqsub.s32 \r3, \r7, \r5 // t6a
 
+.irp r, d4, \r1, d5, \r3
+        vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, \r1, d5, \r3
+        vmax.s32 \r, \r, d8
+.endr
+
         vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5
         vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6
         vrshr.s32 d6, d6, #12 // t5
@@ -763,19 +795,28 @@ endfunc
         vqadd.s32 q2, q8, q12 // t0
         vqsub.s32 q3, q8, q12 // t4
+        vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
         vqadd.s32 q4, q15, q11 // t1
         vqsub.s32 q5, q15, q11 // t5
         vqadd.s32 q6, q10, q14 // t2
         vqsub.s32 q7, q10, q14 // t6
+        vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
         vqadd.s32 q10, q13, q9 // t3
         vqsub.s32 q11, q13, q9 // t7
 
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+        vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+        vmax.s32 \r, \r, q14
+.endr
+
         vmul_vmla q8, q3, q5, d1[1], d1[0]
-        vmul_vmls q12, q3, q5, d1[0], d1[1]
+        vmul_vmls q13, q3, q5, d1[0], d1[1]
         vmul_vmls q14, q11, q7, d1[1], d1[0]
         vrshr.s32 q3, q8, #12 // t4a
-        vrshr.s32 q5, q12, #12 // t5a
+        vrshr.s32 q5, q13, #12 // t5a
         vmul_vmla q8, q11, q7, d1[0], d1[1]
@@ -786,12 +827,24 @@ endfunc
         vqsub.s32 q2, q2, q6 // t2
         vqadd.s32 \r7, q4, q10 // out7
         vqsub.s32 q4, q4, q10 // t3
-        vqneg.s32 \r7, \r7 // out7
+
+        vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
         vqadd.s32 \r1, q3, q7 // out1
         vqsub.s32 q3, q3, q7 // t6
         vqadd.s32 \r6, q5, q11 // out6
         vqsub.s32 q5, q5, q11 // t7
+
+        // Not clipping the output registers, as they will be downshifted and
+        // narrowed afterwards anyway.
+.irp r, q2, q4, q3, q5
+        vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q4, q3, q5
+        vmax.s32 \r, \r, q10
+.endr
+
+        vqneg.s32 \r7, \r7 // out7
         vqneg.s32 \r1, \r1 // out1
 
         vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12)
@@ -1068,6 +1121,14 @@ function inv_dct_2s_x16_neon
 
         idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30
 
+        // idct_8 leaves the row_clip_max/min constants in d9 and d8
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+        vmin.s32 \r, \r, d9
+.endr
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+        vmax.s32 \r, \r, d8
+.endr
+
         vld1.32 {q0, q1}, [r12, :128]
         sub r12, r12, #32
@@ -1099,6 +1160,13 @@ function inv_dct_2s_x16_neon
         vqadd.s32 d25, d29, d27 // t12
         vqsub.s32 d29, d29, d27 // t13
 
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+        vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+        vmax.s32 \r, \r, d8
+.endr
+
         vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a
         vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a
         vrshr.s32 d21, d6, #12 // t9a
@@ -1119,6 +1187,13 @@ function inv_dct_2s_x16_neon
         vqsub.s32 d25, d27, d29 // t13
         vqadd.s32 d27, d27, d29 // t14
 
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+        vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+        vmax.s32 \r, \r, d8
+.endr
+
         vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11
         vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12
         vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a
@@ -1193,6 +1268,9 @@ endfunc
 
         vld1.32 {q0, q1}, [r12, :128]
 
+        vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
         vqsub.s32 d5, d16, d23 // t8a
         vqadd.s32 d16, d16, d23 // t0a
         vqsub.s32 d7, d31, d24 // t9a
@@ -1210,6 +1288,13 @@ endfunc
         vqadd.s32 d28, d25, d30 // t7a
         vqsub.s32 d25, d25, d30 // t15a
 
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+        vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+        vmax.s32 \r, \r, d10
+.endr
+
         vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8
         vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9
         vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10
@@ -1244,6 +1329,13 @@ endfunc
         vqadd.s32 d20, d29, d22 // t11a
         vqsub.s32 d29, d29, d22 // t15a
 
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+        vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+        vmax.s32 \r, \r, d10
+.endr
+
         vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a
         vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a
         vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a
@@ -1272,24 +1364,34 @@ endfunc
         vqadd.s32 \o15,d31, d26 // out15
         vmov \o0, d4
 .endif
-        vqneg.s32 \o15, \o15 // out15
         vqsub.s32 d3, d29, d18 // t15a
         vqadd.s32 \o13,d29, d18 // out13
         vqadd.s32 \o2, d17, d30 // out2
         vqsub.s32 d26, d17, d30 // t14a
-        vqneg.s32 \o13,\o13 // out13
         vqadd.s32 \o1, d19, d27 // out1
         vqsub.s32 d27, d19, d27 // t10
         vqadd.s32 \o14,d28, d20 // out14
         vqsub.s32 d20, d28, d20 // t11
-        vqneg.s32 \o1, \o1 // out1
         vqadd.s32 \o3, d22, d24 // out3
         vqsub.s32 d22, d22, d24 // t6
         vqadd.s32 \o12,d25, d23 // out12
         vqsub.s32 d23, d25, d23 // t7
+
+        // Not clipping the output registers, as they will be downshifted and
+        // narrowed afterwards anyway.
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+        vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+        vmax.s32 \r, \r, d10
+.endr
+
+        vqneg.s32 \o15, \o15 // out15
+        vqneg.s32 \o13,\o13 // out13
+        vqneg.s32 \o1, \o1 // out1
         vqneg.s32 \o3, \o3 // out3
 
         vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
@@ -1947,6 +2049,9 @@ function inv_dct32_odd_2s_x16_neon
 
         vld1.32 {q0, q1}, [r12, :128]
 
+        vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
         vqsub.s32 d5, d16, d24 // t17
         vqadd.s32 d16, d16, d24 // t16
         vqsub.s32 d7, d31, d23 // t30
@@ -1964,6 +2069,13 @@ function inv_dct32_odd_2s_x16_neon
         vqadd.s32 d25, d19, d27 // t28
         vqsub.s32 d19, d19, d27 // t29
 
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+        vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+        vmax.s32 \r, \r, d10
+.endr
+
         vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a
         vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a
         vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a
@@ -2000,6 +2112,13 @@ function inv_dct32_odd_2s_x16_neon
         vqsub.s32 d29, d31, d25 // t28a
         vqadd.s32 d31, d31, d25 // t31a
 
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+        vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+        vmax.s32 \r, \r, d10
+.endr
+
         vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a
         vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a
         vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19
@@ -2037,6 +2156,13 @@ function inv_dct32_odd_2s_x16_neon
         vqsub.s32 d24, d24, d19 // t27a
         vmov d19, d4 // out19
 
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+        vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+        vmax.s32 \r, \r, d10
+.endr
+
         vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20
         vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27
         vrshr.s32 d20, d4, #12 // t20
@@ -2081,6 +2207,18 @@ function inv_txfm_horz\suffix\()_dct_32x2_neon
         scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
 .endif
         bl inv_dct_2s_x16_neon
+
+        // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+        // but here we want to use full q registers for clipping.
+        vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+        vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+        vmax.s32 \r, \r, q2
+.endr
+
         vtrn.32 d16, d17
         vtrn.32 d18, d19
         vtrn.32 d20, d21
@@ -2745,14 +2883,21 @@ function inv_dct64_step1_neon
         vqsub.s32 d30, d23, d22 // t62
         vqadd.s32 d31, d23, d22 // t63
 
+.irp r, q12, q13, q14, q15
+        vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+        vmax.s32 \r, \r, q4
+.endr
+
         vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a
         vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a
         vneg.s32 d4, d4 // t34a
-        vmul_vmls d8, d30, d25, d0[1], d0[0] // -> t33a
+        vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a
         vrshr.s32 d26, d4, #12 // t34a
         vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a
         vrshr.s32 d29, d6, #12 // t61a
-        vrshr.s32 d25, d8, #12 // t33a
+        vrshr.s32 d25, d7, #12 // t33a
         vrshr.s32 d30, d4, #12 // t62a
 
         vqadd.s32 d16, d24, d27 // t32a
@@ -2764,13 +2909,20 @@ function inv_dct64_step1_neon
         vqsub.s32 d21, d30, d29 // t61
         vqadd.s32 d22, d30, d29 // t62
 
+.irp r, q8, q9, q10, q11
+        vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+        vmax.s32 \r, \r, q4
+.endr
+
         vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a
         vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a
-        vmul_vmla d8, d20, d19, d1[0], d1[1] // -> t60
+        vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60
         vrshr.s32 d21, d4, #12 // t61a
         vrshr.s32 d18, d6, #12 // t34a
         vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35
-        vrshr.s32 d20, d8, #12 // t60
+        vrshr.s32 d20, d7, #12 // t60
         vrshr.s32 d19, d4, #12 // t35
 
         vst1.32 {d16, d17, d18, d19}, [r6, :128]!
@@ -2805,14 +2957,21 @@ function inv_dct64_step2_neon
         vqadd.s32 d30, d23, d22 // t48
         vqsub.s32 d31, d23, d22 // t55
 
+.irp r, q12, q13, q14, q15
+        vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+        vmax.s32 \r, \r, q4
+.endr
+
         vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a
         vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a
-        vmul_vmla d8, d31, d28, d1[1], d1[0] // -> t40a
+        vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a
         vrshr.s32 d25, d4, #12 // t56a
         vrshr.s32 d27, d6, #12 // t39a
-        vneg.s32 d8, d8 // t40a
+        vneg.s32 d7, d7 // t40a
         vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a
-        vrshr.s32 d31, d8, #12 // t40a
+        vrshr.s32 d31, d7, #12 // t40a
         vrshr.s32 d28, d4, #12 // t55a
 
         vqadd.s32 d16, d24, d29 // t32a
@@ -2824,13 +2983,20 @@ function inv_dct64_step2_neon
         vqsub.s32 d21, d25, d28 // t55
         vqadd.s32 d22, d25, d28 // t56
 
+.irp r, q8, q9, q10, q11
+        vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+        vmax.s32 \r, \r, q4
+.endr
+
         vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a
         vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a
-        vmul_vmls d8, d20, d19, d0[0], d0[0] // -> t47
+        vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47
         vrshr.s32 d18, d4, #12 // t40a
         vrshr.s32 d21, d6, #12 // t55a
         vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48
-        vrshr.s32 d19, d8, #12 // t47
+        vrshr.s32 d19, d7, #12 // t47
         vrshr.s32 d20, d4, #12 // t48
 
         vstr d16, [r6, #4*2*0] // t32a
@@ -2916,6 +3082,17 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
 
         bl inv_dct_2s_x16_neon
 
+        // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+        // but here we want to use full q registers for clipping.
+        vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+        vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+        vmax.s32 \r, \r, q2
+.endr
+
         store16 r6
 
         movdup_if d0, r12, 2896*8*(1<<16), \scale
@@ -2934,6 +3111,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
 
         mov r9, #-8
 
+        vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
 .macro store_addsub r0, r1, r2, r3
         vld1.32 {d2}, [r6, :64]!
         vld1.32 {d3}, [r6, :64]!
@@ -2942,16 +3121,32 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
         vld1.32 {d4}, [r6, :64]!
         vqadd.s32 d7, d3, \r1
         vqsub.s32 \r1, d3, \r1
+        vmin.s32 d6, d6, d1
+        vmin.s32 \r0, \r0, d1
         vld1.32 {d5}, [r6, :64]!
         vqadd.s32 d2, d4, \r2
         sub r6, r6, #8*4
+        vmax.s32 d6, d6, d0
+        vmax.s32 \r0, \r0, d0
         vqsub.s32 \r2, d4, \r2
+        vmin.s32 d7, d7, d1
+        vmin.s32 \r1, \r1, d1
         vst1.32 {d6}, [r6, :64]!
         vst1.32 {\r0}, [r10, :64], r9
+        vmin.s32 d2, d2, d1
+        vmin.s32 \r2, \r2, d1
+        vmax.s32 d7, d7, d0
+        vmax.s32 \r1, \r1, d0
         vqadd.s32 d3, d5, \r3
         vqsub.s32 \r3, d5, \r3
+        vmax.s32 d2, d2, d0
+        vmax.s32 \r2, \r2, d0
+        vmin.s32 d3, d3, d1
+        vmin.s32 \r3, \r3, d1
         vst1.32 {d7}, [r6, :64]!
         vst1.32 {\r1}, [r10, :64], r9
+        vmax.s32 d3, d3, d0
+        vmax.s32 \r3, \r3, d0
         vst1.32 {d2}, [r6, :64]!
         vst1.32 {\r2}, [r10, :64], r9
         vst1.32 {d3}, [r6, :64]!
@@ -2966,6 +3161,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
         add r6, r6, #2*4*16
 
         movrel_local r12, idct64_coeffs
+        vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
         movdup_if d0, lr, 2896*8*(1<<16), \scale
         vmov_if d7, #0, \clear
         add r9, r7, r8, lsl #4 // offset 16
diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S
index 0a0c776..eee3a96 100644
--- a/src/arm/64/itx16.S
+++ b/src/arm/64/itx16.S
@@ -124,6 +124,13 @@ endconst
 .endif
 .endm
 
+.macro smin_4s r0, r1, r2
+        smin \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+.macro smax_4s r0, r1, r2
+        smax \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+
 .macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
 .ifnb \load
         ld1 {\load}, [\src], x1
@@ -599,12 +606,21 @@ def_fn_4x4 identity, flipadst
 
 .macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
         idct_4 \r0, \r2, \r4, \r6
+        movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+        smin_4s \r, \r, v5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+        smax_4s \r, \r, v4
+.endr
+
         mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
-        mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
+        mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
         mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
         mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
         srshr \r1\().4s, v2.4s, #12 // t4a
-        srshr \r7\().4s, v4.4s, #12 // t7a
+        srshr \r7\().4s, v3.4s, #12 // t7a
         srshr \r3\().4s, v6.4s, #12 // t5a
         srshr \r5\().4s, v7.4s, #12 // t6a
@@ -613,17 +629,24 @@ def_fn_4x4 identity, flipadst
         sqadd v3.4s, \r7\().4s, \r5\().4s // t7
         sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
 
-        mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5
+.irp r, v2, \r1, v3, \r3
+        smin_4s \r, \r, v5
+.endr
+.irp r, v2, \r1, v3, \r3
+        smax_4s \r, \r, v4
+.endr
+
+        mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5
         mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
-        srshr v4.4s, v4.4s, #12 // t5
-        srshr v5.4s, v6.4s, #12 // t6
+        srshr v7.4s, v7.4s, #12 // t5
+        srshr v6.4s, v6.4s, #12 // t6
 
         sqsub \r7\().4s, \r0\().4s, v3.4s // out7
         sqadd \r0\().4s, \r0\().4s, v3.4s // out0
-        sqadd \r1\().4s, \r2\().4s, v5.4s // out1
-        sqsub v6.4s, \r2\().4s, v5.4s // out6
-        sqadd \r2\().4s, \r4\().4s, v4.4s // out2
-        sqsub \r5\().4s, \r4\().4s, v4.4s // out5
+        sqadd \r1\().4s, \r2\().4s, v6.4s // out1
+        sqsub v6.4s, \r2\().4s, v6.4s // out6
+        sqadd \r2\().4s, \r4\().4s, v7.4s // out2
+        sqsub \r5\().4s, \r4\().4s, v7.4s // out5
         sqadd \r3\().4s, \r6\().4s, v2.4s // out3
         sqsub \r4\().4s, \r6\().4s, v2.4s // out4
         mov \r6\().16b, v6.16b // out6
@@ -660,8 +683,11 @@ endfunc
 
         ld1 {v0.4s}, [x16]
 
+        movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+
         sqadd v2.4s, v16.4s, v20.4s // t0
         sqsub v3.4s, v16.4s, v20.4s // t4
+        mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
         sqadd v4.4s, v23.4s, v19.4s // t1
         sqsub v5.4s, v23.4s, v19.4s // t5
         sqadd v6.4s, v18.4s, v22.4s // t2
@@ -669,6 +695,13 @@ endfunc
         sqadd v18.4s, v21.4s, v17.4s // t3
         sqsub v19.4s, v21.4s, v17.4s // t7
 
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+        smin_4s \r, \r, v1
+.endr
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+        smax_4s \r, \r, v20
+.endr
+
         mul_mla v16, v3, v5, v0.s[3], v0.s[2]
         mul_mls v20, v3, v5, v0.s[2], v0.s[3]
         mul_mls v22, v19, v7, v0.s[3], v0.s[2]
@@ -685,12 +718,24 @@ endfunc
         sqsub v2.4s, v2.4s, v6.4s // t2
         sqadd \o7\().4s, v4.4s, v18.4s // out7
         sqsub v4.4s, v4.4s, v18.4s // t3
-        sqneg \o7\().4s, \o7\().4s // out7
+
+        mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
         sqadd \o1\().4s, v3.4s, v7.4s // out1
         sqsub v3.4s, v3.4s, v7.4s // t6
         sqadd \o6\().4s, v5.4s, v19.4s // out6
         sqsub v5.4s, v5.4s, v19.4s // t7
+
+        // Not clipping the output registers, as they will be downshifted and
+        // narrowed afterwards anyway.
+.irp r, v2, v4, v3, v5
+        smin_4s \r, \r, v1
+.endr
+.irp r, v2, v4, v3, v5
+        smax_4s \r, \r, v18
+.endr
+
+        sqneg \o7\().4s, \o7\().4s // out7
         sqneg \o1\().4s, \o1\().4s // out1
 
         mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
@@ -959,25 +1004,33 @@ function inv_dct_4s_x16_neon
 
         idct_8 v16, v18, v20, v22, v24, v26, v28, v30
 
+        // idct_8 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+        smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+        smax \r\().4s, \r\().4s, v4.4s
+.endr
+
         ld1 {v0.4s, v1.4s}, [x16]
         sub x16, x16, #32
 
         mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
-        mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a
+        mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a
         mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
         srshr v17.4s, v2.4s, #12 // t8a
-        srshr v31.4s, v4.4s, #12 // t15a
+        srshr v31.4s, v3.4s, #12 // t15a
         mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
-        mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a
+        mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a
         srshr v23.4s, v6.4s, #12 // t9a
         srshr v25.4s, v2.4s, #12 // t14a
         mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
         mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
-        srshr v21.4s, v4.4s, #12 // t10a
+        srshr v21.4s, v3.4s, #12 // t10a
         srshr v27.4s, v6.4s, #12 // t13a
-        mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a
+        mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a
         srshr v19.4s, v2.4s, #12 // t11a
-        srshr v29.4s, v4.4s, #12 // t12a
+        srshr v29.4s, v3.4s, #12 // t12a
 
         ld1 {v0.4s}, [x16]
@@ -990,14 +1043,21 @@ function inv_dct_4s_x16_neon
         sqadd v25.4s, v29.4s, v27.4s // t12
         sqsub v29.4s, v29.4s, v27.4s // t13
 
-        mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+        smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+        smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+        mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a
         mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
-        srshr v21.4s, v4.4s, #12 // t9a
+        srshr v21.4s, v7.4s, #12 // t9a
         srshr v27.4s, v6.4s, #12 // t14a
-        mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a
+        mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a
         mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
-        srshr v29.4s, v4.4s, #12 // t13a
+        srshr v29.4s, v7.4s, #12 // t13a
         neg v6.4s, v6.4s
         srshr v23.4s, v6.4s, #12 // t10a
@@ -1010,34 +1070,41 @@ function inv_dct_4s_x16_neon
         sqsub v25.4s, v27.4s, v29.4s // t13
         sqadd v27.4s, v27.4s, v29.4s // t14
 
-        mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+        smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+        smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+        mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11
         mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
         mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
-        srshr v4.4s, v4.4s, #12 // t11
-        srshr v5.4s, v6.4s, #12 // t12
-        mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
+        srshr v7.4s, v7.4s, #12 // t11
+        srshr v6.4s, v6.4s, #12 // t12
+        mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a
         srshr v2.4s, v2.4s, #12 // t10a
-        srshr v3.4s, v6.4s, #12 // t13a
+        srshr v3.4s, v3.4s, #12 // t13a
 
-        sqadd v6.4s, v16.4s, v31.4s // out0
+        sqadd v1.4s, v16.4s, v31.4s // out0
         sqsub v31.4s, v16.4s, v31.4s // out15
-        mov v16.16b, v6.16b
+        mov v16.16b, v1.16b
         sqadd v23.4s, v30.4s, v17.4s // out7
-        sqsub v7.4s, v30.4s, v17.4s // out8
+        sqsub v1.4s, v30.4s, v17.4s // out8
         sqadd v17.4s, v18.4s, v27.4s // out1
         sqsub v30.4s, v18.4s, v27.4s // out14
         sqadd v18.4s, v20.4s, v3.4s // out2
         sqsub v29.4s, v20.4s, v3.4s // out13
         sqadd v3.4s, v28.4s, v19.4s // out6
         sqsub v25.4s, v28.4s, v19.4s // out9
-        sqadd v19.4s, v22.4s, v5.4s // out3
-        sqsub v28.4s, v22.4s, v5.4s // out12
-        sqadd v20.4s, v24.4s, v4.4s // out4
-        sqsub v27.4s, v24.4s, v4.4s // out11
+        sqadd v19.4s, v22.4s, v6.4s // out3
+        sqsub v28.4s, v22.4s, v6.4s // out12
+        sqadd v20.4s, v24.4s, v7.4s // out4
+        sqsub v27.4s, v24.4s, v7.4s // out11
         sqadd v21.4s, v26.4s, v2.4s // out5
         sqsub v26.4s, v26.4s, v2.4s // out10
-        mov v24.16b, v7.16b
+        mov v24.16b, v1.16b
         mov v22.16b, v3.16b
 
         ret
@@ -1084,6 +1151,9 @@ endfunc
 
         ld1 {v0.4s, v1.4s}, [x16]
 
+        movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
         sqsub v2.4s, v16.4s, v23.4s // t8a
         sqadd v16.4s, v16.4s, v23.4s // t0a
         sqsub v3.4s, v31.4s, v24.4s // t9a
@@ -1101,6 +1171,13 @@ endfunc
         sqadd v28.4s, v25.4s, v30.4s // t7a
         sqsub v25.4s, v25.4s, v30.4s // t15a
 
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+        smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+        smax_4s \r, \r, v7
+.endr
+
         mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
         mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
         mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
@@ -1135,6 +1212,13 @@ endfunc
         sqadd v20.4s, v29.4s, v22.4s // t11a
         sqsub v29.4s, v29.4s, v22.4s // t15a
 
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+        smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+        smax_4s \r, \r, v7
+.endr
+
         mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
         mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
         mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
@@ -1163,24 +1247,34 @@ endfunc
         sqadd \o15\().4s, v31.4s, v26.4s // out15
         mov \o0\().16b, v4.16b
 .endif
-        sqneg \o15\().4s, \o15\().4s // out15
         sqsub v3.4s, v29.4s, v18.4s // t15a
         sqadd \o13\().4s, v29.4s, v18.4s // out13
         sqadd \o2\().4s, v17.4s, v30.4s // out2
         sqsub v26.4s, v17.4s, v30.4s // t14a
-        sqneg \o13\().4s, \o13\().4s // out13
         sqadd \o1\().4s, v19.4s, v27.4s // out1
         sqsub v27.4s, v19.4s, v27.4s // t10
         sqadd \o14\().4s, v28.4s, v20.4s // out14
         sqsub v20.4s, v28.4s, v20.4s // t11
-        sqneg \o1\().4s, \o1\().4s // out1
         sqadd \o3\().4s, v22.4s, v24.4s // out3
         sqsub v22.4s, v22.4s, v24.4s // t6
         sqadd \o12\().4s, v25.4s, v23.4s // out12
         sqsub v23.4s, v25.4s, v23.4s // t7
+
+        // Not clipping the output registers, as they will be downshifted and
+        // narrowed afterwards anyway.
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+        smin_4s \r, \r, v5
+.endr
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+        smax_4s \r, \r, v7
+.endr
+
+        sqneg \o15\().4s, \o15\().4s // out15
+        sqneg \o13\().4s, \o13\().4s // out13
+        sqneg \o1\().4s, \o1\().4s // out1
         sqneg \o3\().4s, \o3\().4s // out3
 
         mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
@@ -1956,6 +2050,9 @@ function inv_dct32_odd_4s_x16_neon
 
         ld1 {v0.4s, v1.4s}, [x16]
 
+        movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
         sqsub v2.4s, v16.4s, v24.4s // t17
         sqadd v16.4s, v16.4s, v24.4s // t16
         sqsub v3.4s, v31.4s, v23.4s // t30
@@ -1973,23 +2070,30 @@ function inv_dct32_odd_4s_x16_neon
         sqadd v25.4s, v19.4s, v27.4s // t28
         sqsub v19.4s, v19.4s, v27.4s // t29
 
-        mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+        smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+        smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+        mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a
         mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
         mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
-        srshr v21.4s, v4.4s, #12 // t17a
+        srshr v21.4s, v7.4s, #12 // t17a
         srshr v27.4s, v6.4s, #12 // t30a
         neg v2.4s, v2.4s // -> t18a
-        mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a
+        mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a
         mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
         srshr v19.4s, v2.4s, #12 // t18a
-        srshr v24.4s, v4.4s, #12 // t29a
+        srshr v24.4s, v7.4s, #12 // t29a
         mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
-        mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a
+        mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a
         srshr v22.4s, v6.4s, #12 // t21a
         srshr v18.4s, v2.4s, #12 // t26a
-        neg v4.4s, v4.4s // -> t22a
+        neg v7.4s, v7.4s // -> t22a
         mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
-        srshr v17.4s, v4.4s, #12 // t22a
+        srshr v17.4s, v7.4s, #12 // t22a
         srshr v20.4s, v6.4s, #12 // t25a
 
         sqsub v2.4s, v27.4s, v24.4s // t29
@@ -2009,23 +2113,30 @@ function inv_dct32_odd_4s_x16_neon
         sqsub v29.4s, v31.4s, v25.4s // t28a
         sqadd v31.4s, v31.4s, v25.4s // t31a
 
-        mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+        smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+        smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+        mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a
         mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
         mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
-        srshr v18.4s, v4.4s, #12 // t18a
+        srshr v18.4s, v7.4s, #12 // t18a
         srshr v25.4s, v6.4s, #12 // t29a
-        mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28
+        mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28
         mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
         srshr v29.4s, v2.4s, #12 // t19
-        srshr v24.4s, v4.4s, #12 // t28
+        srshr v24.4s, v7.4s, #12 // t28
         neg v6.4s, v6.4s // -> t20
         mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
-        mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a
+        mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a
         srshr v26.4s, v6.4s, #12 // t20
         srshr v19.4s, v2.4s, #12 // t27
-        neg v4.4s, v4.4s // -> t21a
+        neg v7.4s, v7.4s // -> t21a
         mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
-        srshr v20.4s, v4.4s, #12 // t21a
+        srshr v20.4s, v7.4s, #12 // t21a
         srshr v28.4s, v6.4s, #12 // t26a
 
         sqsub v2.4s, v16.4s, v30.4s // t23
@@ -2038,33 +2149,40 @@ function inv_dct32_odd_4s_x16_neon
         sqsub v21.4s, v27.4s, v22.4s // t25a
 
         sqsub v27.4s, v18.4s, v20.4s // t21
         sqadd v18.4s, v18.4s, v20.4s // t18 = out18
-        sqadd v4.4s, v29.4s, v26.4s // t19a = out19
+        sqadd v7.4s, v29.4s, v26.4s // t19a = out19
         sqsub v26.4s, v29.4s, v26.4s // t20a
         sqadd v29.4s, v25.4s, v28.4s // t29 = out29
         sqsub v25.4s, v25.4s, v28.4s // t26
         sqadd v28.4s, v24.4s, v19.4s // t28a = out28
         sqsub v24.4s, v24.4s, v19.4s // t27a
-        mov v19.16b, v4.16b // out19
+        mov v19.16b, v7.16b // out19
 
-        mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+        smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+        smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+        mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20
         mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
-        srshr v20.4s, v4.4s, #12 // t20
+        srshr v20.4s, v7.4s, #12 // t20
         srshr v22.4s, v6.4s, #12 // t27
 
-        mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a
+        mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a
         mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
         mov v27.16b, v22.16b // t27
-        srshr v26.4s, v4.4s, #12 // t26a
+        srshr v26.4s, v7.4s, #12 // t26a
 
         mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
-        mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25
+        mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25
         srshr v21.4s, v6.4s, #12 // t21a
         srshr v22.4s, v24.4s, #12 // t22
-        srshr v25.4s, v4.4s, #12 // t25
+        srshr v25.4s, v7.4s, #12 // t25
 
-        mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a
+        mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a
         mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
-        srshr v23.4s, v4.4s, #12 // t23a
+        srshr v23.4s, v7.4s, #12 // t23a
         srshr v24.4s, v6.4s, #12 // t24a
 
         ret
@@ -2091,6 +2209,15 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon
         scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
 .endif
         bl inv_dct_4s_x16_neon
+
+        // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+        smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+        smax_4s \r, \r, v4
+.endr
+
         transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
         transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
         transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
@@ -2786,13 +2913,20 @@ function inv_dct64_step1_neon
         sqsub v30.4s, v23.4s, v22.4s // t62
         sqadd v31.4s, v23.4s, v22.4s // t63
 
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+        smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+        smax_4s \r, \r, v4
+.endr
+
         mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
-        mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a
+        mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a
         neg v2.4s, v2.4s // t34a
         mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
         srshr v26.4s, v2.4s, #12 // t34a
         mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
-        srshr v29.4s, v4.4s, #12 // t61a
+        srshr v29.4s, v7.4s, #12 // t61a
         srshr v25.4s, v6.4s, #12 // t33a
         srshr v30.4s, v2.4s, #12 // t62a
@@ -2805,11 +2939,18 @@ function inv_dct64_step1_neon
         sqsub v21.4s, v30.4s, v29.4s // t61
         sqadd v22.4s, v30.4s, v29.4s // t62
 
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+        smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+        smax_4s \r, \r, v4
+.endr
+
         mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
-        mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a
+        mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a
         mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
         srshr v21.4s, v2.4s, #12 // t61a
-        srshr v18.4s, v4.4s, #12 // t34a
+        srshr v18.4s, v7.4s, #12 // t34a
         mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
         srshr v20.4s, v6.4s, #12 // t60
         srshr v19.4s, v2.4s, #12 // t35
@@ -2846,11 +2987,18 @@ function inv_dct64_step2_neon
         sqadd v30.4s, v23.4s, v22.4s // t48
         sqsub v31.4s, v23.4s, v22.4s // t55
 
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+        smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+        smax_4s \r, \r, v4
+.endr
+
         mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
-        mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a
+        mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a
         mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
         srshr v25.4s, v2.4s, #12 // t56a
-        srshr v27.4s, v4.4s, #12 // t39a
+        srshr v27.4s, v7.4s, #12 // t39a
         neg v6.4s, v6.4s // t40a
         mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
         srshr v31.4s, v6.4s, #12 // t40a
@@ -2865,11 +3013,18 @@ function inv_dct64_step2_neon
         sqsub v21.4s, v25.4s, v28.4s // t55
         sqadd v22.4s, v25.4s, v28.4s // t56
 
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+        smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+        smax_4s \r, \r, v4
+.endr
+
         mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
-        mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a
+        mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a
         mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
         srshr v18.4s, v2.4s, #12 // t40a
-        srshr v21.4s, v4.4s, #12 // t55a
+        srshr v21.4s, v7.4s, #12 // t55a
         mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
         srshr v19.4s, v6.4s, #12 // t47
         srshr v20.4s, v2.4s, #12 // t48
@@ -2966,6 +3121,14 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
 
         bl inv_dct_4s_x16_neon
 
+        // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+        smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+        smax_4s \r, \r, v4
+.endr
+
         store16 x6
 
         movz16dup_if v0.2s, w16, #2896*8, \scale
@@ -2984,6 +3147,9 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
 
         mov x9, #-16
 
+        movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
 .macro store_addsub r0, r1, r2, r3
         ld1 {v2.4s}, [x6], #16
         ld1 {v3.4s}, [x6], #16
@@ -2992,16 +3158,32 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
         ld1 {v4.4s}, [x6], #16
         sqadd v7.4s, v3.4s, \r1
         sqsub \r1, v3.4s, \r1
+        smin v6.4s, v6.4s, v1.4s
+        smin \r0, \r0, v1.4s
         ld1 {v5.4s}, [x6], #16
         sqadd v2.4s, v4.4s, \r2
         sub x6, x6, #16*4
+        smax v6.4s, v6.4s, v0.4s
+        smax \r0, \r0, v0.4s
         sqsub \r2, v4.4s, \r2
+        smin v7.4s, v7.4s, v1.4s
+        smin \r1, \r1, v1.4s
         st1 {v6.4s}, [x6], #16
         st1 {\r0}, [x10], x9
+        smin v2.4s, v2.4s, v1.4s
+        smin \r2, \r2, v1.4s
+        smax v7.4s, v7.4s, v0.4s
+        smax \r1, \r1, v0.4s
         sqadd v3.4s, v5.4s, \r3
         sqsub \r3, v5.4s, \r3
+        smax v2.4s, v2.4s, v0.4s
+        smax \r2, \r2, v0.4s
+        smin v3.4s, v3.4s, v1.4s
+        smin \r3, \r3, v1.4s
         st1 {v7.4s}, [x6], #16
         st1 {\r1}, [x10], x9
+        smax v3.4s, v3.4s, v0.4s
+        smax \r3, \r3, v0.4s
         st1 {v2.4s}, [x6], #16
         st1 {\r2}, [x10], x9
         st1 {v3.4s}, [x6], #16
@@ -3016,6 +3198,8 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
         add x6, x6, #4*4*16
 
         movrel x17, idct64_coeffs
+        movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+        mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
         movz16dup_if v0.2s, w16, #2896*8, \scale
         movi_if v7.4s, #0, \clear
         add x9, x7, x8, lsl #4 // offset 16
-- 
cgit v1.2.3
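
[Editor's note, not part of the commit: a scalar model of one lane of
the patched store_addsub step, i.e. a saturating add/sub followed by the
row clamp. All names here are hypothetical; sat32() stands in for the
saturation that the NEON sqadd/sqsub (vqadd/vqsub) instructions perform.]

    #include <stdint.h>

    static int32_t sat32(int64_t v) {
        /* Model of NEON saturating 32-bit arithmetic. */
        if (v > INT32_MAX) return INT32_MAX;
        if (v < INT32_MIN) return INT32_MIN;
        return (int32_t)v;
    }

    /* One lane of store_addsub for 10 bpc: both the sum and the
     * difference are clamped to [0xfffe0000, 0x0001ffff]. */
    static void store_addsub_lane(int32_t a, int32_t b,
                                  int32_t *sum, int32_t *diff) {
        const int32_t row_clip_min = (int32_t)0xfffe0000;
        const int32_t row_clip_max = 0x0001ffff;
        int32_t s = sat32((int64_t)a + b);
        int32_t d = sat32((int64_t)a - b);
        if (s < row_clip_min) s = row_clip_min;
        if (s > row_clip_max) s = row_clip_max;
        if (d < row_clip_min) d = row_clip_min;
        if (d > row_clip_max) d = row_clip_max;
        *sum  = s;
        *diff = d;
    }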