Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Martin Storsjö <martin@martin.st> 2019-05-08 23:26:27 +0300
committer: Jean-Baptiste Kempf <jb@videolan.org> 2019-06-26 11:34:39 +0300
commit: ef1ea0083c061a86e578551d0b8409548a549eb2 (patch)
tree: 1d9d50d6fd37775a83869807f3d9ba1f8c25feeb /src/arm/64/util.S
parent: e034611481726b74bab496907bb0675942be4d04 (diff)
arm64: itx: Add NEON optimized inverse transforms
The speedup for most non-dc-only dct functions is around 9-12x over the C code generated by GCC 7.3.

Relative speedups vs C for a few functions:

                                                  Cortex A53     A72     A73
inv_txfm_add_4x4_dct_dct_0_8bpc_neon:                   3.90    4.16    5.65
inv_txfm_add_4x4_dct_dct_1_8bpc_neon:                   7.20    8.05   11.19
inv_txfm_add_8x8_dct_dct_0_8bpc_neon:                   5.09    6.73    6.45
inv_txfm_add_8x8_dct_dct_1_8bpc_neon:                  12.18   10.80   13.05
inv_txfm_add_16x16_dct_dct_0_8bpc_neon:                 7.31    9.35   11.17
inv_txfm_add_16x16_dct_dct_1_8bpc_neon:                14.36   13.06   15.93
inv_txfm_add_16x16_dct_dct_2_8bpc_neon:                11.00   10.09   12.05
inv_txfm_add_32x32_dct_dct_0_8bpc_neon:                 4.41    5.40    5.77
inv_txfm_add_32x32_dct_dct_1_8bpc_neon:                13.84   13.81   18.04
inv_txfm_add_32x32_dct_dct_2_8bpc_neon:                11.75   11.87   15.22
inv_txfm_add_32x32_dct_dct_3_8bpc_neon:                10.20   10.40   13.13
inv_txfm_add_32x32_dct_dct_4_8bpc_neon:                 9.01    9.21   11.56
inv_txfm_add_64x64_dct_dct_0_8bpc_neon:                 3.84    4.82    5.28
inv_txfm_add_64x64_dct_dct_1_8bpc_neon:                14.40   12.69   16.71
inv_txfm_add_64x64_dct_dct_4_8bpc_neon:                10.91    9.63   12.67

Some of the specialcased identity_identity transforms for 32x32 give insane speedups over the generic C code:

inv_txfm_add_32x32_identity_identity_0_8bpc_neon:     225.26  238.11  247.07
inv_txfm_add_32x32_identity_identity_1_8bpc_neon:     225.33  238.53  247.69
inv_txfm_add_32x32_identity_identity_2_8bpc_neon:      59.60   61.94   64.63
inv_txfm_add_32x32_identity_identity_3_8bpc_neon:      26.98   27.99   29.21
inv_txfm_add_32x32_identity_identity_4_8bpc_neon:      15.08   15.93   16.56
Diffstat (limited to 'src/arm/64/util.S')
-rw-r--r-- src/arm/64/util.S 53
1 files changed, 53 insertions, 0 deletions
diff --git a/src/arm/64/util.S b/src/arm/64/util.S
index 91c7f20..3332c85 100644
--- a/src/arm/64/util.S
+++ b/src/arm/64/util.S
@@ -88,6 +88,35 @@
trn2 \r7\().2s, \t9\().2s, \r7\().2s
.endm
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 /* In-place transpose of an 8x8 matrix of 16-bit elements: on entry r0-r7 hold rows 0-7, on exit r0-r7 hold columns 0-7. t8 and t9 are scratch and get overwritten. NOTE(review): operands are presumably ten distinct vector registers - confirm at call sites. */
+ trn1 \t8\().8h, \r0\().8h, \r1\().8h /* stage 1 (16-bit lanes): t8 = even columns of rows 0,1, interleaved pairwise */
+ trn2 \t9\().8h, \r0\().8h, \r1\().8h /* t9 = odd columns of rows 0,1 */
+ trn1 \r1\().8h, \r2\().8h, \r3\().8h /* r1 = even columns of rows 2,3 (the original row 1 was already consumed above) */
+ trn2 \r3\().8h, \r2\().8h, \r3\().8h /* r3 = odd columns of rows 2,3 */
+ trn1 \r0\().8h, \r4\().8h, \r5\().8h /* r0 = even columns of rows 4,5 */
+ trn2 \r5\().8h, \r4\().8h, \r5\().8h /* r5 = odd columns of rows 4,5 */
+ trn1 \r2\().8h, \r6\().8h, \r7\().8h /* r2 = even columns of rows 6,7 */
+ trn2 \r7\().8h, \r6\().8h, \r7\().8h /* r7 = odd columns of rows 6,7 */
+ /* stage 2 (32-bit lanes): merge row pairs so that each register holds two columns of four consecutive rows */
+ trn1 \r4\().4s, \r0\().4s, \r2\().4s /* r4 = columns 0,4 of rows 4-7 */
+ trn2 \r2\().4s, \r0\().4s, \r2\().4s /* r2 = columns 2,6 of rows 4-7 */
+ trn1 \r6\().4s, \r5\().4s, \r7\().4s /* r6 = columns 1,5 of rows 4-7 */
+ trn2 \r7\().4s, \r5\().4s, \r7\().4s /* r7 = columns 3,7 of rows 4-7 */
+ trn1 \r5\().4s, \t9\().4s, \r3\().4s /* r5 = columns 1,5 of rows 0-3 */
+ trn2 \t9\().4s, \t9\().4s, \r3\().4s /* t9 = columns 3,7 of rows 0-3 */
+ trn1 \r3\().4s, \t8\().4s, \r1\().4s /* r3 = columns 0,4 of rows 0-3 */
+ trn2 \t8\().4s, \t8\().4s, \r1\().4s /* t8 = columns 2,6 of rows 0-3 */
+ /* stage 3 (64-bit lanes): join the rows 0-3 half with the rows 4-7 half to form complete columns */
+ trn1 \r0\().2d, \r3\().2d, \r4\().2d /* r0 = column 0 */
+ trn2 \r4\().2d, \r3\().2d, \r4\().2d /* r4 = column 4 */
+ trn1 \r1\().2d, \r5\().2d, \r6\().2d /* r1 = column 1 */
+ trn2 \r5\().2d, \r5\().2d, \r6\().2d /* r5 = column 5 */
+ trn2 \r6\().2d, \t8\().2d, \r2\().2d /* r6 = column 6; must read r2 before the next line overwrites it */
+ trn1 \r2\().2d, \t8\().2d, \r2\().2d /* r2 = column 2 */
+ trn1 \r3\().2d, \t9\().2d, \r7\().2d /* r3 = column 3 */
+ trn2 \r7\().2d, \t9\().2d, \r7\().2d /* r7 = column 7 */
+.endm
+
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
trn1 \t8\().16b, \r0\().16b, \r1\().16b
trn2 \t9\().16b, \r0\().16b, \r1\().16b
@@ -129,4 +158,28 @@
trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm
+.macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7 /* In-place transpose of a 4x4 matrix of 16-bit elements using the .4h arrangement: on entry r0-r3 hold rows 0-3, on exit r0-r3 hold columns 0-3. t4-t7 are scratch and get overwritten. NOTE(review): operands are presumably eight distinct vector registers - confirm at call sites. */
+ trn1 \t4\().4h, \r0\().4h, \r1\().4h /* stage 1 (16-bit lanes): t4 = even columns of rows 0,1, interleaved pairwise */
+ trn2 \t5\().4h, \r0\().4h, \r1\().4h /* t5 = odd columns of rows 0,1 */
+ trn1 \t6\().4h, \r2\().4h, \r3\().4h /* t6 = even columns of rows 2,3 */
+ trn2 \t7\().4h, \r2\().4h, \r3\().4h /* t7 = odd columns of rows 2,3 */
+ /* stage 2 (32-bit lanes): pair rows 0,1 with rows 2,3 to form complete columns */
+ trn1 \r0\().2s, \t4\().2s, \t6\().2s /* r0 = column 0 */
+ trn2 \r2\().2s, \t4\().2s, \t6\().2s /* r2 = column 2 */
+ trn1 \r1\().2s, \t5\().2s, \t7\().2s /* r1 = column 1 */
+ trn2 \r3\().2s, \t5\().2s, \t7\().2s /* r3 = column 3 */
+.endm
+
+.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7 /* Transposes two 4x4 blocks of 16-bit elements at once: with r0-r3 as four rows of eight halfwords, the block in halfwords 0-3 and the block in halfwords 4-7 are each transposed in place, independently. t4-t7 are scratch and get overwritten. NOTE(review): operands are presumably eight distinct vector registers - confirm at call sites. */
+ trn1 \t4\().8h, \r0\().8h, \r1\().8h /* stage 1 (16-bit lanes): t4 = even columns of rows 0,1, interleaved pairwise */
+ trn2 \t5\().8h, \r0\().8h, \r1\().8h /* t5 = odd columns of rows 0,1 */
+ trn1 \t6\().8h, \r2\().8h, \r3\().8h /* t6 = even columns of rows 2,3 */
+ trn2 \t7\().8h, \r2\().8h, \r3\().8h /* t7 = odd columns of rows 2,3 */
+ /* stage 2 (32-bit lanes): pair rows 0,1 with rows 2,3; each result carries one column per 64-bit half */
+ trn1 \r0\().4s, \t4\().4s, \t6\().4s /* r0 = column 0 in halfwords 0-3, column 4 in halfwords 4-7 */
+ trn2 \r2\().4s, \t4\().4s, \t6\().4s /* r2 = column 2 in halfwords 0-3, column 6 in halfwords 4-7 */
+ trn1 \r1\().4s, \t5\().4s, \t7\().4s /* r1 = column 1 in halfwords 0-3, column 5 in halfwords 4-7 */
+ trn2 \r3\().4s, \t5\().4s, \t7\().4s /* r3 = column 3 in halfwords 0-3, column 7 in halfwords 4-7 */
+.endm
+
#endif /* DAV1D_SRC_ARM_64_UTIL_S */