diff options
author | Martin Storsjö <martin@martin.st> | 2020-02-01 15:33:49 +0300 |
---|---|---|
committer | Janne Grunau <janne-vlc@jannau.net> | 2020-02-11 00:27:54 +0300 |
commit | 0bad117eb0f97594a938f17ba05d3ca89ba81a9f (patch) | |
tree | 9c6ff7ad3c9b4a2f9f3b62dd3c6a0654e752a705 /src/arm | |
parent | 2e68c1f36e560af6fa05fcb77c9ae77a76cfef6a (diff) |
arm64: mc: Simplify avg/w_avg/mask by always using the w16 macro
This shortens the source by 40 lines, and gives a significant
speedup on A53, a small speedup on A72 and a very minor slowdown
for avg/w_avg on A73.
Before: Cortex A53 A72 A73
avg_w4_8bpc_neon: 67.4 26.1 25.4
avg_w8_8bpc_neon: 158.7 56.3 59.1
avg_w16_8bpc_neon: 382.9 154.1 160.7
w_avg_w4_8bpc_neon: 99.9 43.6 39.4
w_avg_w8_8bpc_neon: 253.2 98.3 99.0
w_avg_w16_8bpc_neon: 543.1 285.0 301.8
mask_w4_8bpc_neon: 110.6 51.4 45.1
mask_w8_8bpc_neon: 295.0 129.9 114.0
mask_w16_8bpc_neon: 654.6 365.8 369.7
After:
avg_w4_8bpc_neon: 60.8 26.3 29.0
avg_w8_8bpc_neon: 142.8 52.9 64.1
avg_w16_8bpc_neon: 378.2 153.4 160.8
w_avg_w4_8bpc_neon: 78.7 41.0 40.9
w_avg_w8_8bpc_neon: 190.6 90.1 105.1
w_avg_w16_8bpc_neon: 531.1 279.3 301.4
mask_w4_8bpc_neon: 86.6 47.2 44.9
mask_w8_8bpc_neon: 222.0 114.3 114.9
mask_w16_8bpc_neon: 639.5 356.0 369.8
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/64/mc.S | 135 |
1 file changed, 48 insertions(+), 87 deletions(-)
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index ea10fc7..9b27a56 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -29,14 +29,7 @@ #include "src/arm/asm.S" #include "util.S" -.macro avg dst, t0, t1 - ld1 {\t0\().8h}, [x2], 16 - ld1 {\t1\().8h}, [x3], 16 - add \t0\().8h, \t0\().8h, \t1\().8h - sqrshrun \dst\().8b, \t0\().8h, #5 -.endm - -.macro avg16 dst, t0, t1, t2, t3 +.macro avg dst, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 add \t0\().8h, \t0\().8h, \t2\().8h @@ -45,16 +38,7 @@ sqrshrun2 \dst\().16b, \t1\().8h, #5 .endm -.macro w_avg dst, t0, t1 - ld1 {\t0\().8h}, [x2], 16 - ld1 {\t1\().8h}, [x3], 16 - sub \t0\().8h, \t1\().8h, \t0\().8h - sqdmulh \t0\().8h, \t0\().8h, v30.8h - add \t0\().8h, \t1\().8h, \t0\().8h - sqrshrun \dst\().8b, \t0\().8h, #4 -.endm - -.macro w_avg16 dst, t0, t1, t2, t3 +.macro w_avg dst, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 sub \t0\().8h, \t2\().8h, \t0\().8h @@ -67,19 +51,7 @@ sqrshrun2 \dst\().16b, \t1\().8h, #4 .endm -.macro mask dst, t0, t1 - ld1 {v30.8b}, [x6], 8 - ld1 {\t0\().8h}, [x2], 16 - mul v30.8b, v30.8b, v31.8b - ld1 {\t1\().8h}, [x3], 16 - shll v30.8h, v30.8b, #8 - sub \t0\().8h, \t1\().8h, \t0\().8h - sqdmulh \t0\().8h, \t0\().8h, v30.8h - add \t0\().8h, \t1\().8h, \t0\().8h - sqrshrun \dst\().8b, \t0\().8h, #4 -.endm - -.macro mask16 dst, t0, t1, t2, t3 +.macro mask dst, t0, t1, t2, t3 ld1 {v30.16b}, [x6], 16 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 mul v30.16b, v30.16b, v31.16b @@ -109,113 +81,102 @@ function \type\()_8bpc_neon, export=1 .endif adr x7, L(\type\()_tbl) sub w4, w4, #24 - \type v4, v0, v1 ldrh w4, [x7, x4, lsl #1] - \type v5, v2, v3 + \type v4, v0, v1, v2, v3 sub x7, x7, w4, uxtw br x7 4: cmp w5, #4 st1 {v4.s}[0], [x0], x1 st1 {v4.s}[1], [x0], x1 - st1 {v5.s}[0], [x0], x1 - st1 {v5.s}[1], [x0], x1 + st1 {v4.s}[2], [x0], x1 + st1 {v4.s}[3], [x0], x1 b.eq 0f - \type v6, v0, v1 - \type v7, v2, v3 + \type v5, v0, 
v1, v2, v3 cmp w5, #8 - st1 {v6.s}[0], [x0], x1 - st1 {v6.s}[1], [x0], x1 - st1 {v7.s}[0], [x0], x1 - st1 {v7.s}[1], [x0], x1 + st1 {v5.s}[0], [x0], x1 + st1 {v5.s}[1], [x0], x1 + st1 {v5.s}[2], [x0], x1 + st1 {v5.s}[3], [x0], x1 b.eq 0f - \type v4, v0, v1 - \type v5, v2, v3 + \type v4, v0, v1, v2, v3 st1 {v4.s}[0], [x0], x1 st1 {v4.s}[1], [x0], x1 - \type v6, v0, v1 + \type v5, v0, v1, v2, v3 + st1 {v4.s}[2], [x0], x1 + st1 {v4.s}[3], [x0], x1 st1 {v5.s}[0], [x0], x1 st1 {v5.s}[1], [x0], x1 - \type v7, v2, v3 - st1 {v6.s}[0], [x0], x1 - st1 {v6.s}[1], [x0], x1 - st1 {v7.s}[0], [x0], x1 - st1 {v7.s}[1], [x0], x1 + st1 {v5.s}[2], [x0], x1 + st1 {v5.s}[3], [x0], x1 ret 8: - st1 {v4.8b}, [x0], x1 - \type v6, v0, v1 - st1 {v5.8b}, [x0], x1 - \type v7, v0, v1 - st1 {v6.8b}, [x0], x1 + st1 {v4.d}[0], [x0], x1 + \type v5, v0, v1, v2, v3 + st1 {v4.d}[1], [x0], x1 + st1 {v5.d}[0], [x0], x1 subs w5, w5, #4 - st1 {v7.8b}, [x0], x1 + st1 {v5.d}[1], [x0], x1 b.le 0f - \type v4, v0, v1 - \type v5, v2, v3 + \type v4, v0, v1, v2, v3 b 8b -160: - trn1 v4.2d, v4.2d, v5.2d 16: - \type\()16 v5, v0, v1, v2, v3 + \type v5, v0, v1, v2, v3 st1 {v4.16b}, [x0], x1 - \type\()16 v6, v0, v1, v2, v3 + \type v6, v0, v1, v2, v3 st1 {v5.16b}, [x0], x1 - \type\()16 v7, v0, v1, v2, v3 + \type v7, v0, v1, v2, v3 st1 {v6.16b}, [x0], x1 subs w5, w5, #4 st1 {v7.16b}, [x0], x1 b.le 0f - \type\()16 v4, v0, v1, v2, v3 + \type v4, v0, v1, v2, v3 b 16b 320: - trn1 v4.2d, v4.2d, v5.2d add x7, x0, x1 lsl x1, x1, #1 32: - \type\()16 v5, v0, v1, v2, v3 - \type\()16 v6, v0, v1, v2, v3 + \type v5, v0, v1, v2, v3 + \type v6, v0, v1, v2, v3 st1 {v4.16b,v5.16b}, [x0], x1 - \type\()16 v7, v0, v1, v2, v3 + \type v7, v0, v1, v2, v3 subs w5, w5, #2 st1 {v6.16b,v7.16b}, [x7], x1 b.le 0f - \type\()16 v4, v0, v1, v2, v3 + \type v4, v0, v1, v2, v3 b 32b 640: - trn1 v4.2d, v4.2d, v5.2d add x7, x0, x1 lsl x1, x1, #1 64: - \type\()16 v5, v0, v1, v2, v3 - \type\()16 v6, v0, v1, v2, v3 - \type\()16 v7, v0, v1, v2, v3 - \type\()16 
v16, v0, v1, v2, v3 - \type\()16 v17, v0, v1, v2, v3 + \type v5, v0, v1, v2, v3 + \type v6, v0, v1, v2, v3 + \type v7, v0, v1, v2, v3 + \type v16, v0, v1, v2, v3 + \type v17, v0, v1, v2, v3 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 - \type\()16 v18, v0, v1, v2, v3 - \type\()16 v19, v0, v1, v2, v3 + \type v18, v0, v1, v2, v3 + \type v19, v0, v1, v2, v3 subs w5, w5, #2 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 b.le 0f - \type\()16 v4, v0, v1, v2, v3 + \type v4, v0, v1, v2, v3 b 64b 1280: - trn1 v4.2d, v4.2d, v5.2d add x7, x0, #64 128: - \type\()16 v5, v0, v1, v2, v3 - \type\()16 v6, v0, v1, v2, v3 - \type\()16 v7, v0, v1, v2, v3 - \type\()16 v16, v0, v1, v2, v3 - \type\()16 v17, v0, v1, v2, v3 + \type v5, v0, v1, v2, v3 + \type v6, v0, v1, v2, v3 + \type v7, v0, v1, v2, v3 + \type v16, v0, v1, v2, v3 + \type v17, v0, v1, v2, v3 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 - \type\()16 v18, v0, v1, v2, v3 - \type\()16 v19, v0, v1, v2, v3 + \type v18, v0, v1, v2, v3 + \type v19, v0, v1, v2, v3 subs w5, w5, #1 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 b.le 0f - \type\()16 v4, v0, v1, v2, v3 + \type v4, v0, v1, v2, v3 b 128b 0: ret @@ -223,7 +184,7 @@ L(\type\()_tbl): .hword L(\type\()_tbl) - 1280b .hword L(\type\()_tbl) - 640b .hword L(\type\()_tbl) - 320b - .hword L(\type\()_tbl) - 160b + .hword L(\type\()_tbl) - 16b .hword L(\type\()_tbl) - 8b .hword L(\type\()_tbl) - 4b endfunc |