From 0bad117eb0f97594a938f17ba05d3ca89ba81a9f Mon Sep 17 00:00:00 2001
From: Martin Storsjö
Date: Sat, 1 Feb 2020 14:33:49 +0200
Subject: arm64: mc: Simplify avg/w_avg/mask by always using the w16 macro

This shortens the source by 40 lines, and gives a significant speedup
on A53, a small speedup on A72 and a very minor slowdown for avg/w_avg
on A73.

Before:               Cortex A53     A72     A73
avg_w4_8bpc_neon:           67.4    26.1    25.4
avg_w8_8bpc_neon:          158.7    56.3    59.1
avg_w16_8bpc_neon:         382.9   154.1   160.7
w_avg_w4_8bpc_neon:         99.9    43.6    39.4
w_avg_w8_8bpc_neon:        253.2    98.3    99.0
w_avg_w16_8bpc_neon:       543.1   285.0   301.8
mask_w4_8bpc_neon:         110.6    51.4    45.1
mask_w8_8bpc_neon:         295.0   129.9   114.0
mask_w16_8bpc_neon:        654.6   365.8   369.7
After:
avg_w4_8bpc_neon:           60.8    26.3    29.0
avg_w8_8bpc_neon:          142.8    52.9    64.1
avg_w16_8bpc_neon:         378.2   153.4   160.8
w_avg_w4_8bpc_neon:         78.7    41.0    40.9
w_avg_w8_8bpc_neon:        190.6    90.1   105.1
w_avg_w16_8bpc_neon:       531.1   279.3   301.4
mask_w4_8bpc_neon:          86.6    47.2    44.9
mask_w8_8bpc_neon:         222.0   114.3   114.9
mask_w16_8bpc_neon:        639.5   356.0   369.8
---
 src/arm/64/mc.S | 135 ++++++++++++++++++++------------------------------------
 1 file changed, 48 insertions(+), 87 deletions(-)

(limited to 'src/arm')

diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index ea10fc7..9b27a56 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -29,14 +29,7 @@
 #include "src/arm/asm.S"
 #include "util.S"
 
-.macro avg dst, t0, t1
-        ld1             {\t0\().8h},  [x2],  16
-        ld1             {\t1\().8h},  [x3],  16
-        add             \t0\().8h,  \t0\().8h,  \t1\().8h
-        sqrshrun        \dst\().8b,  \t0\().8h,  #5
-.endm
-
-.macro avg16 dst, t0, t1, t2, t3
+.macro avg dst, t0, t1, t2, t3
         ld1             {\t0\().8h,\t1\().8h},  [x2],  32
         ld1             {\t2\().8h,\t3\().8h},  [x3],  32
         add             \t0\().8h,  \t0\().8h,  \t2\().8h
@@ -45,16 +38,7 @@
         sqrshrun2       \dst\().16b,  \t1\().8h,  #5
 .endm
 
-.macro w_avg dst, t0, t1
-        ld1             {\t0\().8h},  [x2],  16
-        ld1             {\t1\().8h},  [x3],  16
-        sub             \t0\().8h,  \t1\().8h,  \t0\().8h
-        sqdmulh         \t0\().8h,  \t0\().8h,  v30.8h
-        add             \t0\().8h,  \t1\().8h,  \t0\().8h
-        sqrshrun        \dst\().8b,  \t0\().8h,  #4
-.endm
-
-.macro w_avg16 dst, t0, t1, t2, t3
+.macro w_avg dst, t0, t1, t2, t3
         ld1             {\t0\().8h,\t1\().8h},  [x2],  32
         ld1             {\t2\().8h,\t3\().8h},  [x3],  32
         sub             \t0\().8h,  \t2\().8h,  \t0\().8h
@@ -67,19 +51,7 @@
         sqrshrun2       \dst\().16b,  \t1\().8h,  #4
 .endm
 
-.macro mask dst, t0, t1
-        ld1             {v30.8b},  [x6],  8
-        ld1             {\t0\().8h},  [x2],  16
-        mul             v30.8b,  v30.8b,  v31.8b
-        ld1             {\t1\().8h},  [x3],  16
-        shll            v30.8h,  v30.8b,  #8
-        sub             \t0\().8h,  \t1\().8h,  \t0\().8h
-        sqdmulh         \t0\().8h,  \t0\().8h,  v30.8h
-        add             \t0\().8h,  \t1\().8h,  \t0\().8h
-        sqrshrun        \dst\().8b,  \t0\().8h,  #4
-.endm
-
-.macro mask16 dst, t0, t1, t2, t3
+.macro mask dst, t0, t1, t2, t3
         ld1             {v30.16b},  [x6],  16
         ld1             {\t0\().8h,\t1\().8h},  [x2],  32
         mul             v30.16b,  v30.16b,  v31.16b
@@ -109,113 +81,102 @@ function \type\()_8bpc_neon, export=1
 .endif
         adr             x7,  L(\type\()_tbl)
         sub             w4,  w4,  #24
-        \type           v4,  v0,  v1
         ldrh            w4,  [x7, x4, lsl #1]
-        \type           v5,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         sub             x7,  x7,  w4,  uxtw
         br              x7
 4:
         cmp             w5,  #4
         st1             {v4.s}[0],  [x0],  x1
         st1             {v4.s}[1],  [x0],  x1
-        st1             {v5.s}[0],  [x0],  x1
-        st1             {v5.s}[1],  [x0],  x1
+        st1             {v4.s}[2],  [x0],  x1
+        st1             {v4.s}[3],  [x0],  x1
         b.eq            0f
-        \type           v6,  v0,  v1
-        \type           v7,  v2,  v3
+        \type           v5,  v0,  v1,  v2,  v3
         cmp             w5,  #8
-        st1             {v6.s}[0],  [x0],  x1
-        st1             {v6.s}[1],  [x0],  x1
-        st1             {v7.s}[0],  [x0],  x1
-        st1             {v7.s}[1],  [x0],  x1
+        st1             {v5.s}[0],  [x0],  x1
+        st1             {v5.s}[1],  [x0],  x1
+        st1             {v5.s}[2],  [x0],  x1
+        st1             {v5.s}[3],  [x0],  x1
         b.eq            0f
-        \type           v4,  v0,  v1
-        \type           v5,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         st1             {v4.s}[0],  [x0],  x1
         st1             {v4.s}[1],  [x0],  x1
-        \type           v6,  v0,  v1
+        \type           v5,  v0,  v1,  v2,  v3
+        st1             {v4.s}[2],  [x0],  x1
+        st1             {v4.s}[3],  [x0],  x1
         st1             {v5.s}[0],  [x0],  x1
         st1             {v5.s}[1],  [x0],  x1
-        \type           v7,  v2,  v3
-        st1             {v6.s}[0],  [x0],  x1
-        st1             {v6.s}[1],  [x0],  x1
-        st1             {v7.s}[0],  [x0],  x1
-        st1             {v7.s}[1],  [x0],  x1
+        st1             {v5.s}[2],  [x0],  x1
+        st1             {v5.s}[3],  [x0],  x1
         ret
 8:
-        st1             {v4.8b},  [x0],  x1
-        \type           v6,  v0,  v1
-        st1             {v5.8b},  [x0],  x1
-        \type           v7,  v0,  v1
-        st1             {v6.8b},  [x0],  x1
+        st1             {v4.d}[0],  [x0],  x1
+        \type           v5,  v0,  v1,  v2,  v3
+        st1             {v4.d}[1],  [x0],  x1
+        st1             {v5.d}[0],  [x0],  x1
         subs            w5,  w5,  #4
-        st1             {v7.8b},  [x0],  x1
+        st1             {v5.d}[1],  [x0],  x1
         b.le            0f
-        \type           v4,  v0,  v1
-        \type           v5,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         b               8b
-160:
-        trn1            v4.2d,  v4.2d,  v5.2d
 16:
-        \type\()16      v5,  v0,  v1,  v2,  v3
+        \type           v5,  v0,  v1,  v2,  v3
         st1             {v4.16b},  [x0],  x1
-        \type\()16      v6,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
         st1             {v5.16b},  [x0],  x1
-        \type\()16      v7,  v0,  v1,  v2,  v3
+        \type           v7,  v0,  v1,  v2,  v3
         st1             {v6.16b},  [x0],  x1
         subs            w5,  w5,  #4
         st1             {v7.16b},  [x0],  x1
         b.le            0f
-        \type\()16      v4,  v0,  v1,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         b               16b
 320:
-        trn1            v4.2d,  v4.2d,  v5.2d
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 32:
-        \type\()16      v5,  v0,  v1,  v2,  v3
-        \type\()16      v6,  v0,  v1,  v2,  v3
+        \type           v5,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
         st1             {v4.16b,v5.16b},  [x0],  x1
-        \type\()16      v7,  v0,  v1,  v2,  v3
+        \type           v7,  v0,  v1,  v2,  v3
         subs            w5,  w5,  #2
         st1             {v6.16b,v7.16b},  [x7],  x1
         b.le            0f
-        \type\()16      v4,  v0,  v1,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         b               32b
 640:
-        trn1            v4.2d,  v4.2d,  v5.2d
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 64:
-        \type\()16      v5,  v0,  v1,  v2,  v3
-        \type\()16      v6,  v0,  v1,  v2,  v3
-        \type\()16      v7,  v0,  v1,  v2,  v3
-        \type\()16      v16, v0,  v1,  v2,  v3
-        \type\()16      v17, v0,  v1,  v2,  v3
+        \type           v5,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
+        \type           v7,  v0,  v1,  v2,  v3
+        \type           v16, v0,  v1,  v2,  v3
+        \type           v17, v0,  v1,  v2,  v3
         st1             {v4.16b,v5.16b,v6.16b,v7.16b},  [x0],  x1
-        \type\()16      v18, v0,  v1,  v2,  v3
-        \type\()16      v19, v0,  v1,  v2,  v3
+        \type           v18, v0,  v1,  v2,  v3
+        \type           v19, v0,  v1,  v2,  v3
         subs            w5,  w5,  #2
         st1             {v16.16b,v17.16b,v18.16b,v19.16b},  [x7],  x1
         b.le            0f
-        \type\()16      v4,  v0,  v1,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         b               64b
 1280:
-        trn1            v4.2d,  v4.2d,  v5.2d
         add             x7,  x0,  #64
 128:
-        \type\()16      v5,  v0,  v1,  v2,  v3
-        \type\()16      v6,  v0,  v1,  v2,  v3
-        \type\()16      v7,  v0,  v1,  v2,  v3
-        \type\()16      v16, v0,  v1,  v2,  v3
-        \type\()16      v17, v0,  v1,  v2,  v3
+        \type           v5,  v0,  v1,  v2,  v3
+        \type           v6,  v0,  v1,  v2,  v3
+        \type           v7,  v0,  v1,  v2,  v3
+        \type           v16, v0,  v1,  v2,  v3
+        \type           v17, v0,  v1,  v2,  v3
         st1             {v4.16b,v5.16b,v6.16b,v7.16b},  [x0],  x1
-        \type\()16      v18, v0,  v1,  v2,  v3
-        \type\()16      v19, v0,  v1,  v2,  v3
+        \type           v18, v0,  v1,  v2,  v3
+        \type           v19, v0,  v1,  v2,  v3
         subs            w5,  w5,  #1
         st1             {v16.16b,v17.16b,v18.16b,v19.16b},  [x7],  x1
         b.le            0f
-        \type\()16      v4,  v0,  v1,  v2,  v3
+        \type           v4,  v0,  v1,  v2,  v3
         b               128b
 0:
         ret
@@ -223,7 +184,7 @@ L(\type\()_tbl):
         .hword L(\type\()_tbl) - 1280b
         .hword L(\type\()_tbl) -  640b
         .hword L(\type\()_tbl) -  320b
-        .hword L(\type\()_tbl) -  160b
+        .hword L(\type\()_tbl) -   16b
         .hword L(\type\()_tbl) -    8b
         .hword L(\type\()_tbl) -    4b
 endfunc
--
cgit v1.2.3
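For readers of the patch, the per-pixel arithmetic the three (now unified) macros perform can be summarized in plain C. The sketch below is derived from the instruction sequences in the diff (add + sqrshrun #5 for avg; sub/sqdmulh/add + sqrshrun #4 for w_avg and mask), assuming dav1d's 8 bpc intermediate format with 4 extra bits of precision. The function names, the clip helper, and the argument mapping (x2/x3 as the tmp1/tmp2 intermediate buffers, w6 as the weight or mask pointer) are illustrative assumptions, not dav1d's actual reference code; the exact w_avg/mask weighting also depends on constant setup outside these hunks.

#include <stdint.h>

/* Illustrative helper: the clamp to [0, 255] that sqrshrun performs when
 * narrowing to unsigned bytes (the rounding term is written out below). */
static inline uint8_t clip_pixel(int v) {
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* avg: add the two 16-bit intermediates, round and shift by 5
 * (add + sqrshrun #5 in the macro above). */
static void avg_ref(uint8_t *dst, const int16_t *tmp1, const int16_t *tmp2, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = clip_pixel((tmp1[i] + tmp2[i] + 16) >> 5);
}

/* w_avg: 4-bit weighted blend; the asm folds the weight into a sqdmulh
 * constant (v30) set up outside this hunk, which is equivalent to this
 * formula up to intermediate truncation. */
static void w_avg_ref(uint8_t *dst, const int16_t *tmp1, const int16_t *tmp2,
                      int n, int weight)
{
    for (int i = 0; i < n; i++)
        dst[i] = clip_pixel((tmp1[i] * weight + tmp2[i] * (16 - weight) + 128) >> 8);
}

/* mask: per-pixel 6-bit blend factor m[i] in 0..64, loaded from x6 in the asm
 * and scaled into a sqdmulh constant via the mul/shll pair. */
static void mask_ref(uint8_t *dst, const int16_t *tmp1, const int16_t *tmp2,
                     const uint8_t *m, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = clip_pixel((tmp1[i] * m[i] + tmp2[i] * (64 - m[i]) + 512) >> 10);
}

Each invocation of the unified macro now produces 16 output pixels in one vector register; the w4 and w8 paths simply store 4- or 8-byte lanes of that register (st1 {v4.s}[0..3] and st1 {v4.d}[0..1]), which is what lets the patch drop the separate narrow macros, the trn1 merges, and the 160: table entry.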