From ea54dbe2a89d3eb4edbdbbf1810180984467c6aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Wed, 1 Apr 2020 23:56:34 +0300
Subject: arm64: mc: NEON implementation of emu_edge for 8bpc

Relative speedups over C code:
                           Cortex A53    A72    A73
emu_edge_w4_8bpc_neon:           3.82   2.93   2.41
emu_edge_w8_8bpc_neon:           3.28   2.86   2.51
emu_edge_w16_8bpc_neon:          3.58   3.27   2.63
emu_edge_w32_8bpc_neon:          3.04   1.68   2.12
emu_edge_w64_8bpc_neon:          2.58   1.45   1.48
emu_edge_w128_8bpc_neon:         1.79   1.02   1.57

The benchmark numbers for the larger size on A72 fluctuate a whole lot
and thus seem very unreliable.
---
 src/arm/64/mc.S        | 158 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/arm/mc_init_tmpl.c |   5 ++
 2 files changed, 163 insertions(+)

(limited to 'src/arm')

diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index 92aa8aa..f6970de 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -3089,3 +3089,161 @@ endfunc
 
 warp  , 11
 warp t, 7
+
+// void dav1d_emu_edge_8bpc_neon(
+//         const intptr_t bw, const intptr_t bh,
+//         const intptr_t iw, const intptr_t ih,
+//         const intptr_t x, const intptr_t y,
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1
+        ldp             x8,  x9,  [sp]         // x8 = ref, x9 = ref_stride (loaded from the stack)
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             x12, x3,  #1           // ih - 1
+        cmp             x5,  x3                // y vs. ih
+        sub             x13, x2,  #1           // iw - 1
+        csel            x12, x12, x5,  ge      // min(y, ih - 1)
+        cmp             x4,  x2                // x vs. iw
+        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+        csel            x13, x13, x4,  ge      // min(x, iw - 1)
+        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
+        add             x8,  x8,  x13          // ref += iclip()
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext    = iclip(-y, 0, bh - 1)
+        add             x10, x5,  x1           // y + bh
+        neg             x5,  x5                // -y
+        sub             x10, x10, x3           // y + bh - ih
+        sub             x12, x1,  #1           // bh - 1
+        cmp             x10, x1                // y + bh - ih vs. bh
+        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
+        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
+        cmp             x5,  x1                // max(-y, 0) vs. bh
+        bic             x10, x10, x10, asr #63 // x10 = bottom_ext = max(min(y + bh - ih, bh-1), 0)
+        csel            x5,  x5,  x12, lt      // x5 = top_ext = min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext  = iclip(-x, 0, bw - 1)
+        add             x11, x4,  x0           // x + bw
+        neg             x4,  x4                // -x
+        sub             x11, x11, x2           // x + bw - iw
+        sub             x13, x0,  #1           // bw - 1
+        cmp             x11, x0                // x + bw - iw vs. bw
+        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
+        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
+        cmp             x4,  x0                // max(-x, 0) vs. bw
+        bic             x11, x11, x11, asr #63 // x11 = right_ext = max(min(x + bw - iw, bw-1), 0)
+        csel            x4,  x4,  x13, lt      // x4 = left_ext = min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             x1,  x1,  x5           // bh - top_ext
+        madd            x6,  x5,  x7,  x6      // dst += top_ext * dst_stride
+        sub             x2,  x0,  x4           // bw - left_ext
+        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
+        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
+
+        mov             x14, x6                // backup of dst (first center row), used by need_top
+
+.macro v_loop need_left, need_right
+0:                                             // one iteration per center row
+.if \need_left
+        ld1r            {v0.16b}, [x8]         // replicate the first ref pixel of this row into all lanes
+        mov             x12, x6                // out = dst
+        mov             x3,  x4                // remaining = left_ext
+1:
+        subs            x3,  x3,  #16
+        st1             {v0.16b}, [x12], #16   // whole 16-byte stores; excess is overwritten by the center copy
+        b.gt            1b
+.endif
+        mov             x13, x8                // in = ref
+        add             x12, x6,  x4           // out = dst + left_ext
+        mov             x3,  x2                // remaining = center_w
+1:
+        ld1             {v0.16b, v1.16b}, [x13], #32 // copied in 32-byte chunks, so may run past center_w
+        subs            x3,  x3,  #32
+        st1             {v0.16b, v1.16b}, [x12], #32
+        b.gt            1b
+.if \need_right
+        add             x3,  x8,  x2           // in + center_w
+        sub             x3,  x3,  #1           // in + center_w - 1
+        add             x12, x6,  x4           // dst + left_ext
+        ld1r            {v0.16b}, [x3]         // replicate the last center pixel into all lanes
+        add             x12, x12, x2           // out = dst + left_ext + center_w
+        mov             x3,  x11               // remaining = right_ext
+1:
+        subs            x3,  x3,  #16
+        st1             {v0.16b}, [x12], #16   // NOTE(review): whole 16-byte stores can pass dst + bw; presumably dst is padded - confirm
+        b.gt            1b
+.endif
+
+        subs            x1,  x1,  #1           // center_h--
+        add             x6,  x6,  x7           // dst += dst_stride
+        add             x8,  x8,  x9           // ref += ref_stride
+        b.gt            0b                     // loop while center_h > 0 (flags from the subs above)
+.endm
+
+        cbz             x4,  2f                // left_ext == 0?
+        // need_left
+        cbz             x11, 3f                // right_ext == 0?
+        // need_left + need_right
+        v_loop          1,   1
+        b               5f
+
+2:
+        // !need_left
+        cbz             x11, 4f                // right_ext == 0?
+        // !need_left + need_right
+        v_loop          0,   1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1,   0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0,   0
+
+5:
+
+        cbz             x10, 3f                // skip if bottom_ext == 0
+        // need_bottom
+        sub             x8,  x6,  x7           // ref = dst - stride (last center row written above)
+        mov             x4,  x0                // x4 = bw (remaining columns)
+1:
+        ld1             {v0.16b, v1.16b}, [x8], #32 // load 32 pixels of the last center row
+        mov             x3,  x10               // rows left = bottom_ext
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.16b, v1.16b}, [x6], x7 // store, then advance dst by one row
+        b.gt            2b
+        msub            x6,  x7,  x10, x6      // dst -= bottom_ext * stride
+        subs            x4,  x4,  #32          // bw -= 32
+        add             x6,  x6,  #32          // dst += 32
+        b.gt            1b
+
+3:
+        cbz             x5,  3f                // skip if top_ext == 0
+        // need_top
+        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
+1:
+        ld1             {v0.16b, v1.16b}, [x14], #32 // load 32 pixels of the first center row
+        mov             x3,  x5                // rows left = top_ext
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.16b, v1.16b}, [x6], x7 // store, then advance dst by one row
+        b.gt            2b
+        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
+        subs            x0,  x0,  #32          // bw -= 32
+        add             x6,  x6,  #32          // dst += 32
+        b.gt            1b
+
+3:
+        ret
+endfunc
diff --git a/src/arm/mc_init_tmpl.c b/src/arm/mc_init_tmpl.c
index b17b781..f9cb8bc 100644
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -66,6 +66,8 @@ decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
 decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
 decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
 
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+
 void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
 #define init_mc_fn(type, name, suffix) \
     c->mc[type] = BF(dav1d_put_##name, suffix)
@@ -110,4 +112,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
     c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
     c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
 #endif
+#if BITDEPTH == 8 && ARCH_AARCH64
+    c->emu_edge = BF(dav1d_emu_edge, neon);
+#endif
 }
-- 
cgit v1.2.3