From 52e9b4353f968fd27e2bd912b0e2302509063068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 4 Mar 2020 10:51:50 +0200 Subject: arm: mc: Optimize blend_v Use a post-increment with a register on the last increment, avoiding a separate increment. Avoid processing the last 8 pixels in the w32 case when we only output 24 pixels. Before: ARM32 Cortex A7 A8 A9 A53 A72 A73 blend_v_w4_8bpc_neon: 450.4 574.7 538.7 374.6 199.3 260.5 blend_v_w8_8bpc_neon: 559.6 351.3 552.5 357.6 214.8 204.3 blend_v_w16_8bpc_neon: 926.3 511.6 787.9 593.0 271.0 246.8 blend_v_w32_8bpc_neon: 1482.5 917.0 1149.5 991.9 354.0 368.9 ARM64 blend_v_w4_8bpc_neon: 351.1 200.0 224.1 blend_v_w8_8bpc_neon: 333.0 212.4 203.8 blend_v_w16_8bpc_neon: 495.2 302.0 247.0 blend_v_w32_8bpc_neon: 840.0 557.8 514.0 After: ARM32 blend_v_w4_8bpc_neon: 435.5 575.8 537.6 356.2 198.3 259.5 blend_v_w8_8bpc_neon: 545.2 347.9 553.5 339.1 207.8 204.2 blend_v_w16_8bpc_neon: 913.7 511.0 788.1 573.7 275.4 243.3 blend_v_w32_8bpc_neon: 1445.3 951.2 1079.1 920.4 352.2 361.6 ARM64 blend_v_w4_8bpc_neon: 333.0 191.3 225.9 blend_v_w8_8bpc_neon: 314.9 199.3 203.5 blend_v_w16_8bpc_neon: 476.9 301.3 241.1 blend_v_w32_8bpc_neon: 766.9 432.8 416.9 --- src/arm/32/mc.S | 28 +++++++++++----------------- src/arm/64/mc.S | 40 +++++++++++++--------------------------- 2 files changed, 24 insertions(+), 44 deletions(-) (limited to 'src/arm') diff --git a/src/arm/32/mc.S b/src/arm/32/mc.S index ece2057..36f6c2e 100644 --- a/src/arm/32/mc.S +++ b/src/arm/32/mc.S @@ -753,7 +753,7 @@ L(blend_v_tbl): add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d5, d22, d4 - sub r1, r1, #3 + sub r1, r1, #2 4: vld1.u8 {d2}, [r2, :64]! vld1.32 {d0[]}, [r0, :32] @@ -764,10 +764,8 @@ L(blend_v_tbl): vrshrn.i16 d20, q3, #6 vst1.16 {d20[0]}, [r0, :16]! vst1.16 {d20[2]}, [r12, :16]! - vst1.8 {d20[2]}, [r0]! - vst1.8 {d20[6]}, [r12]! - add r0, r0, r1 - add r12, r12, r1 + vst1.8 {d20[2]}, [r0], r1 + vst1.8 {d20[6]}, [r12], r1 bgt 4b pop {r4-r5,pc} 80: @@ -776,7 +774,7 @@ L(blend_v_tbl): add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d17, d16, d2 - sub r1, r1, #6 + sub r1, r1, #4 8: vld1.u8 {d4, d5}, [r2, :128]! vld1.u8 {d0}, [r0, :64] @@ -790,10 +788,8 @@ L(blend_v_tbl): vrshrn.i16 d23, q10, #6 vst1.32 {d22[0]}, [r0, :32]! vst1.32 {d23[0]}, [r12, :32]! - vst1.16 {d22[2]}, [r0, :16]! - vst1.16 {d23[2]}, [r12, :16]! - add r0, r0, r1 - add r12, r12, r1 + vst1.16 {d22[2]}, [r0, :16], r1 + vst1.16 {d23[2]}, [r12, :16], r1 bgt 8b pop {r4-r5,pc} 160: @@ -802,7 +798,7 @@ L(blend_v_tbl): add r12, r0, r1 lsl r1, r1, #1 vsub.i8 q11, q12, q14 - sub r1, r1, #12 + sub r1, r1, #8 16: vld1.u8 {q1, q2}, [r2, :128]! vld1.u8 {q0}, [r0, :128] @@ -822,20 +818,18 @@ L(blend_v_tbl): vrshrn.i16 d21, q8, #6 vst1.u8 {d18}, [r0, :64]! vst1.u8 {d20}, [r12, :64]! - vst1.32 {d19[0]}, [r0, :32]! - vst1.32 {d21[0]}, [r12, :32]! - add r0, r0, r1 - add r12, r12, r1 + vst1.32 {d19[0]}, [r0, :32], r1 + vst1.32 {d21[0]}, [r12, :32], r1 bgt 16b pop {r4-r5,pc} 320: vmov.i8 q10, #64 vld1.u8 {q2, q3}, [r5, :128] vsub.i8 q11, q10, q2 - vsub.i8 q12, q10, q3 + vsub.i8 d24, d20, d6 32: vld1.u8 {q8, q9}, [r2, :128]! - vld1.u8 {q0, q1}, [r0, :128] + vld1.u8 {d0, d1, d2}, [r0, :64] subs r4, r4, #1 vmull.u8 q15, d16, d4 vmlal.u8 q15, d0, d22 diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 0f2d834..92aa8aa 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -709,8 +709,8 @@ function blend_v_8bpc_neon, export=1 ret 40: ld1r {v0.2s}, [x5] + sub x1, x1, #2 sub v1.8b, v4.8b, v0.8b - sub x1, x1, #3 4: ld1 {v2.8b}, [x2], #8 ld1 {v3.s}[0], [x0] @@ -721,16 +721,14 @@ function blend_v_8bpc_neon, export=1 rshrn v5.8b, v5.8h, #6 st1 {v5.h}[0], [x0], #2 st1 {v5.h}[2], [x8], #2 - st1 {v5.b}[2], [x0], #1 - st1 {v5.b}[6], [x8], #1 - add x0, x0, x1 - add x8, x8, x1 + st1 {v5.b}[2], [x0], x1 + st1 {v5.b}[6], [x8], x1 b.gt 4b ret 80: ld1r {v0.2d}, [x5] + sub x1, x1, #4 sub v1.16b, v4.16b, v0.16b - sub x1, x1, #6 8: ld1 {v2.16b}, [x2], #16 ld1 {v3.d}[0], [x0] @@ -744,16 +742,14 @@ function blend_v_8bpc_neon, export=1 rshrn2 v7.16b, v6.8h, #6 st1 {v7.s}[0], [x0], #4 st1 {v7.s}[2], [x8], #4 - st1 {v7.h}[2], [x0], #2 - st1 {v7.h}[6], [x8], #2 - add x0, x0, x1 - add x8, x8, x1 + st1 {v7.h}[2], [x0], x1 + st1 {v7.h}[6], [x8], x1 b.gt 8b ret 160: ld1 {v0.16b}, [x5] + sub x1, x1, #8 sub v2.16b, v4.16b, v0.16b - sub x1, x1, #12 16: ld1 {v5.16b, v6.16b}, [x2], #32 ld1 {v7.16b}, [x0] @@ -773,17 +769,15 @@ function blend_v_8bpc_neon, export=1 rshrn2 v22.16b, v21.8h, #6 st1 {v19.8b}, [x0], #8 st1 {v22.8b}, [x8], #8 - st1 {v19.s}[2], [x0], #4 - st1 {v22.s}[2], [x8], #4 - add x0, x0, x1 - add x8, x8, x1 + st1 {v19.s}[2], [x0], x1 + st1 {v22.s}[2], [x8], x1 b.gt 16b ret 320: ld1 {v0.16b, v1.16b}, [x5] + sub x1, x1, #16 sub v2.16b, v4.16b, v0.16b - sub v3.16b, v4.16b, v1.16b - sub x1, x1, #24 + sub v3.8b, v4.8b, v1.8b 32: ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 ld1 {v5.16b, v6.16b}, [x0] @@ -795,30 +789,22 @@ function blend_v_8bpc_neon, export=1 umlal2 v23.8h, v5.16b, v2.16b umull v28.8h, v17.8b, v1.8b umlal v28.8h, v6.8b, v3.8b - umull2 v29.8h, v17.16b, v1.16b - umlal2 v29.8h, v6.16b, v3.16b umull v30.8h, v18.8b, v0.8b umlal v30.8h, v20.8b, v2.8b umull2 v31.8h, v18.16b, v0.16b umlal2 v31.8h, v20.16b, v2.16b umull v25.8h, v19.8b, v1.8b umlal v25.8h, v21.8b, v3.8b - umull2 v26.8h, v19.16b, v1.16b - umlal2 v26.8h, v21.16b, v3.16b rshrn v24.8b, v22.8h, #6 rshrn2 v24.16b, v23.8h, #6 rshrn v28.8b, v28.8h, #6 - rshrn2 v28.16b, v29.8h, #6 rshrn v30.8b, v30.8h, #6 rshrn2 v30.16b, v31.8h, #6 rshrn v27.8b, v25.8h, #6 - rshrn2 v27.16b, v26.8h, #6 st1 {v24.16b}, [x0], #16 st1 {v30.16b}, [x8], #16 - st1 {v28.8b}, [x0], #8 - st1 {v27.8b}, [x8], #8 - add x0, x0, x1 - add x8, x8, x1 + st1 {v28.8b}, [x0], x1 + st1 {v27.8b}, [x8], x1 b.gt 32b ret L(blend_v_tbl): -- cgit v1.2.3