github.com/videolan/dav1d.git
author     Martin Storsjö <martin@martin.st>   2022-09-09 12:16:03 +0300
committer  Martin Storsjö <martin@martin.st>   2022-09-19 23:40:34 +0300
commit     345127a79532a679de025b031bdbc264df5f17ea (patch)
tree       98d856dc754837220ca1c2ee278f6b0b2310ba91
parent     9c74a9b01d34ca0df44c5b38970e43273c459918 (diff)

arm: itx: Add clipping to row_clip_min/max in the 10 bpc codepaths
This fixes conformance with the argon test samples, in particular with
these samples:

    profile0_core/streams/test10100_579_8614.obu
    profile0_core/streams/test10218_6914.obu

This gives a pretty notable slowdown to these transforms - some examples:

                                          Cortex A53     A72     A73  Apple M1
Before:
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:         365.7   290.2   299.8       0.3
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:      1865.2  1384.1  1457.5       2.6
inv_txfm_add_64x64_dct_dct_4_10bpc_neon:     33976.3 26817.0 24864.2      40.4
After:
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:         397.7   322.2   335.1       0.4
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:      2121.9  1336.7  1664.6       2.6
inv_txfm_add_64x64_dct_dct_4_10bpc_neon:     38569.4 27622.6 28176.0      51.0

Thus, for the transforms alone, this makes them around 10-13% slower (the
Apple M1 measurements are too noisy to be conclusive here). Measured on
actual full decoding, it makes decoding of 10 bpc Chimera maybe 1% slower
on an Apple M1 - close to measurement noise anyway.
-rw-r--r--  src/arm/32/itx16.S | 239
-rw-r--r--  src/arm/64/itx16.S | 320
2 files changed, 470 insertions(+), 89 deletions(-)
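
Before the patch itself, a short C sketch of what the added clipping
computes may help. This is a hypothetical illustration, not dav1d's actual
C code; the constant names follow the comments in the assembly below, where
movi/mvni with msl #16 (AArch64) and vmov.i32/vmvn.i32 (AArch32) materialize
the same 0x1ffff / 0xfffe0000 values:

    #include <stdint.h>

    /* Hypothetical helper, not dav1d's actual code: the clip range that
     * the NEON constants in this patch encode, and the clamp applied to
     * intermediate coefficients between butterfly stages.
     *
     * For 10 bpc, bdmax = 0x3ff, so:
     *   row_clip_min = (~bdmax) << 7 = 0xfffe0000 = -(1 << 17)
     *   row_clip_max = ~row_clip_min = 0x0001ffff =  (1 << 17) - 1
     * i.e. intermediates are kept within signed (bitdepth + 8) bits. */
    static inline int32_t clip_row(int32_t v, int bitdepth)
    {
        /* Unsigned shift avoids UB on left-shifting a negative value. */
        const uint32_t bdmax = (1u << bitdepth) - 1;
        const int32_t row_clip_min = (int32_t)(~bdmax << 7);
        const int32_t row_clip_max = ~row_clip_min;
        if (v < row_clip_min) return row_clip_min;
        if (v > row_clip_max) return row_clip_max;
        return v;
    }

In the assembly, this clamp is applied as paired vmin.s32/vmax.s32
(AArch32) or smin/smax (AArch64) sweeps over whole batches of registers
via .irp, which is why the hunks below come in min/max pairs.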
diff --git a/src/arm/32/itx16.S b/src/arm/32/itx16.S
index db8ecff..aa6c272 100644
--- a/src/arm/32/itx16.S
+++ b/src/arm/32/itx16.S
@@ -668,12 +668,21 @@ def_fn_4x4 identity, flipadst
.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
idct_4s_x4 \r0, \r2, \r4, \r6
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a
- vmul_vmla q4, \r1, \r7, d2[1], d2[0] // -> t7a
+ vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a
vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a
vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a
vrshr.s32 \r1, q2, #12 // t4a
- vrshr.s32 \r7, q4, #12 // t7a
+ vrshr.s32 \r7, q3, #12 // t7a
vrshr.s32 \r3, q6, #12 // t5a
vrshr.s32 \r5, q7, #12 // t6a
@@ -682,17 +691,24 @@ def_fn_4x4 identity, flipadst
vqadd.s32 q3, \r7, \r5 // t7
vqsub.s32 \r3, \r7, \r5 // t6a
- vmul_vmls q4, \r3, \r1, d0[0], d0[0] // -> t5
+.irp r, q2, \r1, q3, \r3
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q2, \r1, q3, \r3
+ vmax.s32 \r, \r, q4
+.endr
+
+ vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5
vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6
- vrshr.s32 q4, q4, #12 // t5
+ vrshr.s32 q7, q7, #12 // t5
vrshr.s32 q5, q6, #12 // t6
vqsub.s32 \r7, \r0, q3 // out7
vqadd.s32 \r0, \r0, q3 // out0
vqadd.s32 \r1, \r2, q5 // out1
vqsub.s32 q6, \r2, q5 // out6
- vqadd.s32 \r2, \r4, q4 // out2
- vqsub.s32 \r5, \r4, q4 // out5
+ vqadd.s32 \r2, \r4, q7 // out2
+ vqsub.s32 \r5, \r4, q7 // out5
vqadd.s32 \r3, \r6, q2 // out3
vqsub.s32 \r4, \r6, q2 // out4
vmov \r6, q6 // out6
@@ -701,6 +717,15 @@ def_fn_4x4 identity, flipadst
.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7
idct_2s_x4 \r0, \r2, \r4, \r6
+ vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a
vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a
vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a
@@ -715,6 +740,13 @@ def_fn_4x4 identity, flipadst
vqadd.s32 d5, \r7, \r5 // t7
vqsub.s32 \r3, \r7, \r5 // t6a
+.irp r, d4, \r1, d5, \r3
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, \r1, d5, \r3
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5
vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6
vrshr.s32 d6, d6, #12 // t5
@@ -763,19 +795,28 @@ endfunc
vqadd.s32 q2, q8, q12 // t0
vqsub.s32 q3, q8, q12 // t4
+ vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vqadd.s32 q4, q15, q11 // t1
vqsub.s32 q5, q15, q11 // t5
vqadd.s32 q6, q10, q14 // t2
vqsub.s32 q7, q10, q14 // t6
+ vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
vqadd.s32 q10, q13, q9 // t3
vqsub.s32 q11, q13, q9 // t7
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q3, q4, q5, q6, q7, q10, q11
+ vmax.s32 \r, \r, q14
+.endr
+
vmul_vmla q8, q3, q5, d1[1], d1[0]
- vmul_vmls q12, q3, q5, d1[0], d1[1]
+ vmul_vmls q13, q3, q5, d1[0], d1[1]
vmul_vmls q14, q11, q7, d1[1], d1[0]
vrshr.s32 q3, q8, #12 // t4a
- vrshr.s32 q5, q12, #12 // t5a
+ vrshr.s32 q5, q13, #12 // t5a
vmul_vmla q8, q11, q7, d1[0], d1[1]
@@ -786,12 +827,24 @@ endfunc
vqsub.s32 q2, q2, q6 // t2
vqadd.s32 \r7, q4, q10 // out7
vqsub.s32 q4, q4, q10 // t3
- vqneg.s32 \r7, \r7 // out7
+
+ vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
vqadd.s32 \r1, q3, q7 // out1
vqsub.s32 q3, q3, q7 // t6
vqadd.s32 \r6, q5, q11 // out6
vqsub.s32 q5, q5, q11 // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, q2, q4, q3, q5
+ vmin.s32 \r, \r, q12
+.endr
+.irp r, q2, q4, q3, q5
+ vmax.s32 \r, \r, q10
+.endr
+
+ vqneg.s32 \r7, \r7 // out7
vqneg.s32 \r1, \r1 // out1
vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12)
@@ -1068,6 +1121,14 @@ function inv_dct_2s_x16_neon
idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30
+ // idct_8 leaves the row_clip_max/min constants in d9 and d8
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d16, d18, d20, d22, d24, d26, d28, d30
+ vmax.s32 \r, \r, d8
+.endr
+
vld1.32 {q0, q1}, [r12, :128]
sub r12, r12, #32
@@ -1099,6 +1160,13 @@ function inv_dct_2s_x16_neon
vqadd.s32 d25, d29, d27 // t12
vqsub.s32 d29, d29, d27 // t13
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d23, d19, d25, d29
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a
vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a
vrshr.s32 d21, d6, #12 // t9a
@@ -1119,6 +1187,13 @@ function inv_dct_2s_x16_neon
vqsub.s32 d25, d27, d29 // t13
vqadd.s32 d27, d27, d29 // t14
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmin.s32 \r, \r, d9
+.endr
+.irp r, d4, d17, d5, d31, d19, d21, d25, d27
+ vmax.s32 \r, \r, d8
+.endr
+
vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11
vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12
vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a
@@ -1193,6 +1268,9 @@ endfunc
vld1.32 {q0, q1}, [r12, :128]
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
vqsub.s32 d5, d16, d23 // t8a
vqadd.s32 d16, d16, d23 // t0a
vqsub.s32 d7, d31, d24 // t9a
@@ -1210,6 +1288,13 @@ endfunc
vqadd.s32 d28, d25, d30 // t7a
vqsub.s32 d25, d25, d30 // t15a
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8
vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9
vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10
@@ -1244,6 +1329,13 @@ endfunc
vqadd.s32 d20, d29, d22 // t11a
vqsub.s32 d29, d29, d22 // t15a
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a
vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a
vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a
@@ -1272,24 +1364,34 @@ endfunc
vqadd.s32 \o15,d31, d26 // out15
vmov \o0, d4
.endif
- vqneg.s32 \o15, \o15 // out15
vqsub.s32 d3, d29, d18 // t15a
vqadd.s32 \o13,d29, d18 // out13
vqadd.s32 \o2, d17, d30 // out2
vqsub.s32 d26, d17, d30 // t14a
- vqneg.s32 \o13,\o13 // out13
vqadd.s32 \o1, d19, d27 // out1
vqsub.s32 d27, d19, d27 // t10
vqadd.s32 \o14,d28, d20 // out14
vqsub.s32 d20, d28, d20 // t11
- vqneg.s32 \o1, \o1 // out1
vqadd.s32 \o3, d22, d24 // out3
vqsub.s32 d22, d22, d24 // t6
vqadd.s32 \o12,d25, d23 // out12
vqsub.s32 d23, d25, d23 // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d21, d3, d26, d27, d20, d22, d23
+ vmax.s32 \r, \r, d10
+.endr
+
+ vqneg.s32 \o15, \o15 // out15
+ vqneg.s32 \o13,\o13 // out13
+ vqneg.s32 \o1, \o1 // out1
vqneg.s32 \o3, \o3 // out3
vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
@@ -1947,6 +2049,9 @@ function inv_dct32_odd_2s_x16_neon
vld1.32 {q0, q1}, [r12, :128]
+ vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+
vqsub.s32 d5, d16, d24 // t17
vqadd.s32 d16, d16, d24 // t16
vqsub.s32 d7, d31, d23 // t30
@@ -1964,6 +2069,13 @@ function inv_dct32_odd_2s_x16_neon
vqadd.s32 d25, d19, d27 // t28
vqsub.s32 d19, d19, d27 // t29
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a
vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a
vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a
@@ -2000,6 +2112,13 @@ function inv_dct32_odd_2s_x16_neon
vqsub.s32 d29, d31, d25 // t28a
vqadd.s32 d31, d31, d25 // t31a
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a
vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a
vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19
@@ -2037,6 +2156,13 @@ function inv_dct32_odd_2s_x16_neon
vqsub.s32 d24, d24, d19 // t27a
vmov d19, d4 // out19
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmin.s32 \r, \r, d11
+.endr
+.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
+ vmax.s32 \r, \r, d10
+.endr
+
vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20
vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27
vrshr.s32 d20, d4, #12 // t20
@@ -2081,6 +2207,18 @@ function inv_txfm_horz\suffix\()_dct_32x2_neon
scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
.endif
bl inv_dct_2s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
vtrn.32 d16, d17
vtrn.32 d18, d19
vtrn.32 d20, d21
@@ -2745,14 +2883,21 @@ function inv_dct64_step1_neon
vqsub.s32 d30, d23, d22 // t62
vqadd.s32 d31, d23, d22 // t63
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a
vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a
vneg.s32 d4, d4 // t34a
- vmul_vmls d8, d30, d25, d0[1], d0[0] // -> t33a
+ vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a
vrshr.s32 d26, d4, #12 // t34a
vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a
vrshr.s32 d29, d6, #12 // t61a
- vrshr.s32 d25, d8, #12 // t33a
+ vrshr.s32 d25, d7, #12 // t33a
vrshr.s32 d30, d4, #12 // t62a
vqadd.s32 d16, d24, d27 // t32a
@@ -2764,13 +2909,20 @@ function inv_dct64_step1_neon
vqsub.s32 d21, d30, d29 // t61
vqadd.s32 d22, d30, d29 // t62
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a
vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a
- vmul_vmla d8, d20, d19, d1[0], d1[1] // -> t60
+ vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60
vrshr.s32 d21, d4, #12 // t61a
vrshr.s32 d18, d6, #12 // t34a
vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35
- vrshr.s32 d20, d8, #12 // t60
+ vrshr.s32 d20, d7, #12 // t60
vrshr.s32 d19, d4, #12 // t35
vst1.32 {d16, d17, d18, d19}, [r6, :128]!
@@ -2805,14 +2957,21 @@ function inv_dct64_step2_neon
vqadd.s32 d30, d23, d22 // t48
vqsub.s32 d31, d23, d22 // t55
+.irp r, q12, q13, q14, q15
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q12, q13, q14, q15
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a
vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a
- vmul_vmla d8, d31, d28, d1[1], d1[0] // -> t40a
+ vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a
vrshr.s32 d25, d4, #12 // t56a
vrshr.s32 d27, d6, #12 // t39a
- vneg.s32 d8, d8 // t40a
+ vneg.s32 d7, d7 // t40a
vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a
- vrshr.s32 d31, d8, #12 // t40a
+ vrshr.s32 d31, d7, #12 // t40a
vrshr.s32 d28, d4, #12 // t55a
vqadd.s32 d16, d24, d29 // t32a
@@ -2824,13 +2983,20 @@ function inv_dct64_step2_neon
vqsub.s32 d21, d25, d28 // t55
vqadd.s32 d22, d25, d28 // t56
+.irp r, q8, q9, q10, q11
+ vmin.s32 \r, \r, q5
+.endr
+.irp r, q8, q9, q10, q11
+ vmax.s32 \r, \r, q4
+.endr
+
vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a
vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a
- vmul_vmls d8, d20, d19, d0[0], d0[0] // -> t47
+ vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47
vrshr.s32 d18, d4, #12 // t40a
vrshr.s32 d21, d6, #12 // t55a
vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48
- vrshr.s32 d19, d8, #12 // t47
+ vrshr.s32 d19, d7, #12 // t47
vrshr.s32 d20, d4, #12 // t48
vstr d16, [r6, #4*2*0] // t32a
@@ -2916,6 +3082,17 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
bl inv_dct_2s_x16_neon
+ // idct_16 leaves the row_clip_max/min constants in d9 and d8,
+ // but here we want to use full q registers for clipping.
+ vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmin.s32 \r, \r, q3
+.endr
+.irp r, q8, q9, q10, q11, q12, q13, q14, q15
+ vmax.s32 \r, \r, q2
+.endr
+
store16 r6
movdup_if d0, r12, 2896*8*(1<<16), \scale
@@ -2934,6 +3111,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
mov r9, #-8
+ vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
.macro store_addsub r0, r1, r2, r3
vld1.32 {d2}, [r6, :64]!
vld1.32 {d3}, [r6, :64]!
@@ -2942,16 +3121,32 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
vld1.32 {d4}, [r6, :64]!
vqadd.s32 d7, d3, \r1
vqsub.s32 \r1, d3, \r1
+ vmin.s32 d6, d6, d1
+ vmin.s32 \r0, \r0, d1
vld1.32 {d5}, [r6, :64]!
vqadd.s32 d2, d4, \r2
sub r6, r6, #8*4
+ vmax.s32 d6, d6, d0
+ vmax.s32 \r0, \r0, d0
vqsub.s32 \r2, d4, \r2
+ vmin.s32 d7, d7, d1
+ vmin.s32 \r1, \r1, d1
vst1.32 {d6}, [r6, :64]!
vst1.32 {\r0}, [r10, :64], r9
+ vmin.s32 d2, d2, d1
+ vmin.s32 \r2, \r2, d1
+ vmax.s32 d7, d7, d0
+ vmax.s32 \r1, \r1, d0
vqadd.s32 d3, d5, \r3
vqsub.s32 \r3, d5, \r3
+ vmax.s32 d2, d2, d0
+ vmax.s32 \r2, \r2, d0
+ vmin.s32 d3, d3, d1
+ vmin.s32 \r3, \r3, d1
vst1.32 {d7}, [r6, :64]!
vst1.32 {\r1}, [r10, :64], r9
+ vmax.s32 d3, d3, d0
+ vmax.s32 \r3, \r3, d0
vst1.32 {d2}, [r6, :64]!
vst1.32 {\r2}, [r10, :64], r9
vst1.32 {d3}, [r6, :64]!
@@ -2966,6 +3161,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
add r6, r6, #2*4*16
movrel_local r12, idct64_coeffs
+ vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
movdup_if d0, lr, 2896*8*(1<<16), \scale
vmov_if d7, #0, \clear
add r9, r7, r8, lsl #4 // offset 16
diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S
index 0a0c776..eee3a96 100644
--- a/src/arm/64/itx16.S
+++ b/src/arm/64/itx16.S
@@ -124,6 +124,13 @@ endconst
.endif
.endm
+.macro smin_4s r0, r1, r2
+ smin \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+.macro smax_4s r0, r1, r2
+ smax \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+
.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
@@ -599,12 +606,21 @@ def_fn_4x4 identity, flipadst
.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
idct_4 \r0, \r2, \r4, \r6
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ smin_4s \r, \r, v5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ smax_4s \r, \r, v4
+.endr
+
mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
- mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
+ mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
srshr \r1\().4s, v2.4s, #12 // t4a
- srshr \r7\().4s, v4.4s, #12 // t7a
+ srshr \r7\().4s, v3.4s, #12 // t7a
srshr \r3\().4s, v6.4s, #12 // t5a
srshr \r5\().4s, v7.4s, #12 // t6a
@@ -613,17 +629,24 @@ def_fn_4x4 identity, flipadst
sqadd v3.4s, \r7\().4s, \r5\().4s // t7
sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
- mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5
+.irp r, v2, \r1, v3, \r3
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, \r1, v3, \r3
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5
mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
- srshr v4.4s, v4.4s, #12 // t5
- srshr v5.4s, v6.4s, #12 // t6
+ srshr v7.4s, v7.4s, #12 // t5
+ srshr v6.4s, v6.4s, #12 // t6
sqsub \r7\().4s, \r0\().4s, v3.4s // out7
sqadd \r0\().4s, \r0\().4s, v3.4s // out0
- sqadd \r1\().4s, \r2\().4s, v5.4s // out1
- sqsub v6.4s, \r2\().4s, v5.4s // out6
- sqadd \r2\().4s, \r4\().4s, v4.4s // out2
- sqsub \r5\().4s, \r4\().4s, v4.4s // out5
+ sqadd \r1\().4s, \r2\().4s, v6.4s // out1
+ sqsub v6.4s, \r2\().4s, v6.4s // out6
+ sqadd \r2\().4s, \r4\().4s, v7.4s // out2
+ sqsub \r5\().4s, \r4\().4s, v7.4s // out5
sqadd \r3\().4s, \r6\().4s, v2.4s // out3
sqsub \r4\().4s, \r6\().4s, v2.4s // out4
mov \r6\().16b, v6.16b // out6
@@ -660,8 +683,11 @@ endfunc
ld1 {v0.4s}, [x16]
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+
sqadd v2.4s, v16.4s, v20.4s // t0
sqsub v3.4s, v16.4s, v20.4s // t4
+ mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqadd v4.4s, v23.4s, v19.4s // t1
sqsub v5.4s, v23.4s, v19.4s // t5
sqadd v6.4s, v18.4s, v22.4s // t2
@@ -669,6 +695,13 @@ endfunc
sqadd v18.4s, v21.4s, v17.4s // t3
sqsub v19.4s, v21.4s, v17.4s // t7
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smax_4s \r, \r, v20
+.endr
+
mul_mla v16, v3, v5, v0.s[3], v0.s[2]
mul_mls v20, v3, v5, v0.s[2], v0.s[3]
mul_mls v22, v19, v7, v0.s[3], v0.s[2]
@@ -685,12 +718,24 @@ endfunc
sqsub v2.4s, v2.4s, v6.4s // t2
sqadd \o7\().4s, v4.4s, v18.4s // out7
sqsub v4.4s, v4.4s, v18.4s // t3
- sqneg \o7\().4s, \o7\().4s // out7
+
+ mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqadd \o1\().4s, v3.4s, v7.4s // out1
sqsub v3.4s, v3.4s, v7.4s // t6
sqadd \o6\().4s, v5.4s, v19.4s // out6
sqsub v5.4s, v5.4s, v19.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v4, v3, v5
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v4, v3, v5
+ smax_4s \r, \r, v18
+.endr
+
+ sqneg \o7\().4s, \o7\().4s // out7
sqneg \o1\().4s, \o1\().4s // out1
mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
@@ -959,25 +1004,33 @@ function inv_dct_4s_x16_neon
idct_8 v16, v18, v20, v22, v24, v26, v28, v30
+ // idct_8 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
ld1 {v0.4s, v1.4s}, [x16]
sub x16, x16, #32
mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
- mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a
+ mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a
mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
srshr v17.4s, v2.4s, #12 // t8a
- srshr v31.4s, v4.4s, #12 // t15a
+ srshr v31.4s, v3.4s, #12 // t15a
mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
- mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a
+ mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a
srshr v23.4s, v6.4s, #12 // t9a
srshr v25.4s, v2.4s, #12 // t14a
mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
- srshr v21.4s, v4.4s, #12 // t10a
+ srshr v21.4s, v3.4s, #12 // t10a
srshr v27.4s, v6.4s, #12 // t13a
- mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a
+ mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a
srshr v19.4s, v2.4s, #12 // t11a
- srshr v29.4s, v4.4s, #12 // t12a
+ srshr v29.4s, v3.4s, #12 // t12a
ld1 {v0.4s}, [x16]
@@ -990,14 +1043,21 @@ function inv_dct_4s_x16_neon
sqadd v25.4s, v29.4s, v27.4s // t12
sqsub v29.4s, v29.4s, v27.4s // t13
- mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a
mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
- srshr v21.4s, v4.4s, #12 // t9a
+ srshr v21.4s, v7.4s, #12 // t9a
srshr v27.4s, v6.4s, #12 // t14a
- mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a
+ mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a
mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
- srshr v29.4s, v4.4s, #12 // t13a
+ srshr v29.4s, v7.4s, #12 // t13a
neg v6.4s, v6.4s
srshr v23.4s, v6.4s, #12 // t10a
@@ -1010,34 +1070,41 @@ function inv_dct_4s_x16_neon
sqsub v25.4s, v27.4s, v29.4s // t13
sqadd v27.4s, v27.4s, v29.4s // t14
- mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
- srshr v4.4s, v4.4s, #12 // t11
- srshr v5.4s, v6.4s, #12 // t12
- mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
+ srshr v7.4s, v7.4s, #12 // t11
+ srshr v6.4s, v6.4s, #12 // t12
+ mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a
srshr v2.4s, v2.4s, #12 // t10a
- srshr v3.4s, v6.4s, #12 // t13a
+ srshr v3.4s, v3.4s, #12 // t13a
- sqadd v6.4s, v16.4s, v31.4s // out0
+ sqadd v1.4s, v16.4s, v31.4s // out0
sqsub v31.4s, v16.4s, v31.4s // out15
- mov v16.16b, v6.16b
+ mov v16.16b, v1.16b
sqadd v23.4s, v30.4s, v17.4s // out7
- sqsub v7.4s, v30.4s, v17.4s // out8
+ sqsub v1.4s, v30.4s, v17.4s // out8
sqadd v17.4s, v18.4s, v27.4s // out1
sqsub v30.4s, v18.4s, v27.4s // out14
sqadd v18.4s, v20.4s, v3.4s // out2
sqsub v29.4s, v20.4s, v3.4s // out13
sqadd v3.4s, v28.4s, v19.4s // out6
sqsub v25.4s, v28.4s, v19.4s // out9
- sqadd v19.4s, v22.4s, v5.4s // out3
- sqsub v28.4s, v22.4s, v5.4s // out12
- sqadd v20.4s, v24.4s, v4.4s // out4
- sqsub v27.4s, v24.4s, v4.4s // out11
+ sqadd v19.4s, v22.4s, v6.4s // out3
+ sqsub v28.4s, v22.4s, v6.4s // out12
+ sqadd v20.4s, v24.4s, v7.4s // out4
+ sqsub v27.4s, v24.4s, v7.4s // out11
sqadd v21.4s, v26.4s, v2.4s // out5
sqsub v26.4s, v26.4s, v2.4s // out10
- mov v24.16b, v7.16b
+ mov v24.16b, v1.16b
mov v22.16b, v3.16b
ret
@@ -1084,6 +1151,9 @@ endfunc
ld1 {v0.4s, v1.4s}, [x16]
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
sqsub v2.4s, v16.4s, v23.4s // t8a
sqadd v16.4s, v16.4s, v23.4s // t0a
sqsub v3.4s, v31.4s, v24.4s // t9a
@@ -1101,6 +1171,13 @@ endfunc
sqadd v28.4s, v25.4s, v30.4s // t7a
sqsub v25.4s, v25.4s, v30.4s // t15a
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smax_4s \r, \r, v7
+.endr
+
mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
@@ -1135,6 +1212,13 @@ endfunc
sqadd v20.4s, v29.4s, v22.4s // t11a
sqsub v29.4s, v29.4s, v22.4s // t15a
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smax_4s \r, \r, v7
+.endr
+
mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
@@ -1163,24 +1247,34 @@ endfunc
sqadd \o15\().4s, v31.4s, v26.4s // out15
mov \o0\().16b, v4.16b
.endif
- sqneg \o15\().4s, \o15\().4s // out15
sqsub v3.4s, v29.4s, v18.4s // t15a
sqadd \o13\().4s, v29.4s, v18.4s // out13
sqadd \o2\().4s, v17.4s, v30.4s // out2
sqsub v26.4s, v17.4s, v30.4s // t14a
- sqneg \o13\().4s, \o13\().4s // out13
sqadd \o1\().4s, v19.4s, v27.4s // out1
sqsub v27.4s, v19.4s, v27.4s // t10
sqadd \o14\().4s, v28.4s, v20.4s // out14
sqsub v20.4s, v28.4s, v20.4s // t11
- sqneg \o1\().4s, \o1\().4s // out1
sqadd \o3\().4s, v22.4s, v24.4s // out3
sqsub v22.4s, v22.4s, v24.4s // t6
sqadd \o12\().4s, v25.4s, v23.4s // out12
sqsub v23.4s, v25.4s, v23.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smax_4s \r, \r, v7
+.endr
+
+ sqneg \o15\().4s, \o15\().4s // out15
+ sqneg \o13\().4s, \o13\().4s // out13
+ sqneg \o1\().4s, \o1\().4s // out1
sqneg \o3\().4s, \o3\().4s // out3
mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
@@ -1956,6 +2050,9 @@ function inv_dct32_odd_4s_x16_neon
ld1 {v0.4s, v1.4s}, [x16]
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
sqsub v2.4s, v16.4s, v24.4s // t17
sqadd v16.4s, v16.4s, v24.4s // t16
sqsub v3.4s, v31.4s, v23.4s // t30
@@ -1973,23 +2070,30 @@ function inv_dct32_odd_4s_x16_neon
sqadd v25.4s, v19.4s, v27.4s // t28
sqsub v19.4s, v19.4s, v27.4s // t29
- mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a
mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
- srshr v21.4s, v4.4s, #12 // t17a
+ srshr v21.4s, v7.4s, #12 // t17a
srshr v27.4s, v6.4s, #12 // t30a
neg v2.4s, v2.4s // -> t18a
- mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a
+ mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a
mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
srshr v19.4s, v2.4s, #12 // t18a
- srshr v24.4s, v4.4s, #12 // t29a
+ srshr v24.4s, v7.4s, #12 // t29a
mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
- mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a
+ mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a
srshr v22.4s, v6.4s, #12 // t21a
srshr v18.4s, v2.4s, #12 // t26a
- neg v4.4s, v4.4s // -> t22a
+ neg v7.4s, v7.4s // -> t22a
mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
- srshr v17.4s, v4.4s, #12 // t22a
+ srshr v17.4s, v7.4s, #12 // t22a
srshr v20.4s, v6.4s, #12 // t25a
sqsub v2.4s, v27.4s, v24.4s // t29
@@ -2009,23 +2113,30 @@ function inv_dct32_odd_4s_x16_neon
sqsub v29.4s, v31.4s, v25.4s // t28a
sqadd v31.4s, v31.4s, v25.4s // t31a
- mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a
mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
- srshr v18.4s, v4.4s, #12 // t18a
+ srshr v18.4s, v7.4s, #12 // t18a
srshr v25.4s, v6.4s, #12 // t29a
- mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28
+ mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28
mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
srshr v29.4s, v2.4s, #12 // t19
- srshr v24.4s, v4.4s, #12 // t28
+ srshr v24.4s, v7.4s, #12 // t28
neg v6.4s, v6.4s // -> t20
mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
- mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a
+ mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a
srshr v26.4s, v6.4s, #12 // t20
srshr v19.4s, v2.4s, #12 // t27
- neg v4.4s, v4.4s // -> t21a
+ neg v7.4s, v7.4s // -> t21a
mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
- srshr v20.4s, v4.4s, #12 // t21a
+ srshr v20.4s, v7.4s, #12 // t21a
srshr v28.4s, v6.4s, #12 // t26a
sqsub v2.4s, v16.4s, v30.4s // t23
@@ -2038,33 +2149,40 @@ function inv_dct32_odd_4s_x16_neon
sqsub v21.4s, v27.4s, v22.4s // t25a
sqsub v27.4s, v18.4s, v20.4s // t21
sqadd v18.4s, v18.4s, v20.4s // t18 = out18
- sqadd v4.4s, v29.4s, v26.4s // t19a = out19
+ sqadd v7.4s, v29.4s, v26.4s // t19a = out19
sqsub v26.4s, v29.4s, v26.4s // t20a
sqadd v29.4s, v25.4s, v28.4s // t29 = out29
sqsub v25.4s, v25.4s, v28.4s // t26
sqadd v28.4s, v24.4s, v19.4s // t28a = out28
sqsub v24.4s, v24.4s, v19.4s // t27a
- mov v19.16b, v4.16b // out19
+ mov v19.16b, v7.16b // out19
- mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20
mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
- srshr v20.4s, v4.4s, #12 // t20
+ srshr v20.4s, v7.4s, #12 // t20
srshr v22.4s, v6.4s, #12 // t27
- mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a
+ mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a
mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
mov v27.16b, v22.16b // t27
- srshr v26.4s, v4.4s, #12 // t26a
+ srshr v26.4s, v7.4s, #12 // t26a
mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
- mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25
+ mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25
srshr v21.4s, v6.4s, #12 // t21a
srshr v22.4s, v24.4s, #12 // t22
- srshr v25.4s, v4.4s, #12 // t25
+ srshr v25.4s, v7.4s, #12 // t25
- mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
- srshr v23.4s, v4.4s, #12 // t23a
+ srshr v23.4s, v7.4s, #12 // t23a
srshr v24.4s, v6.4s, #12 // t24a
ret
@@ -2091,6 +2209,15 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon
scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
bl inv_dct_4s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
@@ -2786,13 +2913,20 @@ function inv_dct64_step1_neon
sqsub v30.4s, v23.4s, v22.4s // t62
sqadd v31.4s, v23.4s, v22.4s // t63
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
- mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a
+ mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a
neg v2.4s, v2.4s // t34a
mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
srshr v26.4s, v2.4s, #12 // t34a
mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
- srshr v29.4s, v4.4s, #12 // t61a
+ srshr v29.4s, v7.4s, #12 // t61a
srshr v25.4s, v6.4s, #12 // t33a
srshr v30.4s, v2.4s, #12 // t62a
@@ -2805,11 +2939,18 @@ function inv_dct64_step1_neon
sqsub v21.4s, v30.4s, v29.4s // t61
sqadd v22.4s, v30.4s, v29.4s // t62
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
- mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a
+ mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a
mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
srshr v21.4s, v2.4s, #12 // t61a
- srshr v18.4s, v4.4s, #12 // t34a
+ srshr v18.4s, v7.4s, #12 // t34a
mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
srshr v20.4s, v6.4s, #12 // t60
srshr v19.4s, v2.4s, #12 // t35
@@ -2846,11 +2987,18 @@ function inv_dct64_step2_neon
sqadd v30.4s, v23.4s, v22.4s // t48
sqsub v31.4s, v23.4s, v22.4s // t55
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
- mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a
+ mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a
mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
srshr v25.4s, v2.4s, #12 // t56a
- srshr v27.4s, v4.4s, #12 // t39a
+ srshr v27.4s, v7.4s, #12 // t39a
neg v6.4s, v6.4s // t40a
mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
srshr v31.4s, v6.4s, #12 // t40a
@@ -2865,11 +3013,18 @@ function inv_dct64_step2_neon
sqsub v21.4s, v25.4s, v28.4s // t55
sqadd v22.4s, v25.4s, v28.4s // t56
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
- mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a
+ mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a
mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
srshr v18.4s, v2.4s, #12 // t40a
- srshr v21.4s, v4.4s, #12 // t55a
+ srshr v21.4s, v7.4s, #12 // t55a
mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
srshr v19.4s, v6.4s, #12 // t47
srshr v20.4s, v2.4s, #12 // t48
@@ -2966,6 +3121,14 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
bl inv_dct_4s_x16_neon
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
store16 x6
movz16dup_if v0.2s, w16, #2896*8, \scale
@@ -2984,6 +3147,9 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
mov x9, #-16
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
.macro store_addsub r0, r1, r2, r3
ld1 {v2.4s}, [x6], #16
ld1 {v3.4s}, [x6], #16
@@ -2992,16 +3158,32 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
ld1 {v4.4s}, [x6], #16
sqadd v7.4s, v3.4s, \r1
sqsub \r1, v3.4s, \r1
+ smin v6.4s, v6.4s, v1.4s
+ smin \r0, \r0, v1.4s
ld1 {v5.4s}, [x6], #16
sqadd v2.4s, v4.4s, \r2
sub x6, x6, #16*4
+ smax v6.4s, v6.4s, v0.4s
+ smax \r0, \r0, v0.4s
sqsub \r2, v4.4s, \r2
+ smin v7.4s, v7.4s, v1.4s
+ smin \r1, \r1, v1.4s
st1 {v6.4s}, [x6], #16
st1 {\r0}, [x10], x9
+ smin v2.4s, v2.4s, v1.4s
+ smin \r2, \r2, v1.4s
+ smax v7.4s, v7.4s, v0.4s
+ smax \r1, \r1, v0.4s
sqadd v3.4s, v5.4s, \r3
sqsub \r3, v5.4s, \r3
+ smax v2.4s, v2.4s, v0.4s
+ smax \r2, \r2, v0.4s
+ smin v3.4s, v3.4s, v1.4s
+ smin \r3, \r3, v1.4s
st1 {v7.4s}, [x6], #16
st1 {\r1}, [x10], x9
+ smax v3.4s, v3.4s, v0.4s
+ smax \r3, \r3, v0.4s
st1 {v2.4s}, [x6], #16
st1 {\r2}, [x10], x9
st1 {v3.4s}, [x6], #16
@@ -3016,6 +3198,8 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
add x6, x6, #4*4*16
movrel x17, idct64_coeffs
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
movz16dup_if v0.2s, w16, #2896*8, \scale
movi_if v7.4s, #0, \clear
add x9, x7, x8, lsl #4 // offset 16