diff options
author | Martin Storsjö <martin@martin.st> | 2020-04-04 23:49:51 +0300 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2020-04-05 22:31:39 +0300 |
commit | e03f2d062a845392d547e9cef4fa2e54f660a17d (patch) | |
tree | ca9562679ed09c54edbd262b02dc78085bc8899a /src/arm | |
parent | ea54dbe2a89d3eb4edbdbbf1810180984467c6aa (diff) |
arm64: mc: NEON implementation of emu_edge for 16bpc
Relative speedup over C code:
Cortex A53 A72 A73
emu_edge_w4_16bpc_neon: 2.49 1.53 1.91
emu_edge_w8_16bpc_neon: 2.27 1.55 1.90
emu_edge_w16_16bpc_neon: 2.46 1.46 2.09
emu_edge_w32_16bpc_neon: 2.20 1.39 1.73
emu_edge_w64_16bpc_neon: 1.65 1.00 1.46
emu_edge_w128_16bpc_neon: 1.55 1.44 1.54
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/64/mc16.S | 160 | ||||
-rw-r--r-- | src/arm/mc_init_tmpl.c | 2 |
2 files changed, 161 insertions, 1 deletions
diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S index 5fbc398..7ac1863 100644 --- a/src/arm/64/mc16.S +++ b/src/arm/64/mc16.S @@ -3407,3 +3407,163 @@ endfunc warp warp t + +// void dav1d_emu_edge_16bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *ref, const ptrdiff_t ref_stride) +function emu_edge_16bpc_neon, export=1 + ldp x8, x9, [sp] + + // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + // ref += iclip(x, 0, iw - 1) + sub x12, x3, #1 // ih - 1 + cmp x5, x3 + sub x13, x2, #1 // iw - 1 + csel x12, x12, x5, ge // min(y, ih - 1) + cmp x4, x2 + bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) + csel x13, x13, x4, ge // min(x, iw - 1) + bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) + madd x8, x12, x9, x8 // ref += iclip() * stride + add x8, x8, x13, lsl #1 // ref += iclip() + + // bottom_ext = iclip(y + bh - ih, 0, bh - 1) + // top_ext = iclip(-y, 0, bh - 1) + add x10, x5, x1 // y + bh + neg x5, x5 // -y + sub x10, x10, x3 // y + bh - ih + sub x12, x1, #1 // bh - 1 + cmp x10, x1 + bic x5, x5, x5, asr #63 // max(-y, 0) + csel x10, x10, x12, lt // min(y + bh - ih, bh-1) + cmp x5, x1 + bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) + csel x5, x5, x12, lt // min(max(-y, 0), bh-1) + + // right_ext = iclip(x + bw - iw, 0, bw - 1) + // left_ext = iclip(-x, 0, bw - 1) + add x11, x4, x0 // x + bw + neg x4, x4 // -x + sub x11, x11, x2 // x + bw - iw + sub x13, x0, #1 // bw - 1 + cmp x11, x0 + bic x4, x4, x4, asr #63 // max(-x, 0) + csel x11, x11, x13, lt // min(x + bw - iw, bw-1) + cmp x4, x0 + bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) + csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) + + // center_h = bh - top_ext - bottom_ext + // dst += top_ext * PXSTRIDE(dst_stride) + // center_w = bw - left_ext - right_ext + sub x1, x1, x5 // bh - top_ext + madd x6, x5, x7, x6 + sub x2, x0, x4 // bw - left_ext + sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext + sub x2, x2, x11 // center_w = bw - left_ext - right_ext + + mov x14, x6 // backup of dst + +.macro v_loop need_left, need_right +0: +.if \need_left + ld1r {v0.8h}, [x8] + mov x12, x6 // out = dst + mov x3, x4 + mov v1.16b, v0.16b +1: + subs x3, x3, #16 + st1 {v0.8h, v1.8h}, [x12], #32 + b.gt 1b +.endif + mov x13, x8 + add x12, x6, x4, lsl #1 // out = dst + left_ext + mov x3, x2 +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64 + subs x3, x3, #32 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64 + b.gt 1b +.if \need_right + add x3, x8, x2, lsl #1 // in + center_w + sub x3, x3, #2 // in + center_w - 1 + add x12, x6, x4, lsl #1 // dst + left_ext + ld1r {v0.8h}, [x3] + add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w + mov x3, x11 + mov v1.16b, v0.16b +1: + subs x3, x3, #16 + st1 {v0.8h, v1.8h}, [x12], #32 + b.gt 1b +.endif + + subs x1, x1, #1 // center_h-- + add x6, x6, x7 + add x8, x8, x9 + b.gt 0b +.endm + + cbz x4, 2f + // need_left + cbz x11, 3f + // need_left + need_right + v_loop 1, 1 + b 5f + +2: + // !need_left + cbz x11, 4f + // !need_left + need_right + v_loop 0, 1 + b 5f + +3: + // need_left + !need_right + v_loop 1, 0 + b 5f + +4: + // !need_left + !need_right + v_loop 0, 0 + +5: + + cbz x10, 3f + // need_bottom + sub x8, x6, x7 // ref = dst - stride + mov x4, x0 +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64 + mov x3, x10 +2: + subs x3, x3, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 + b.gt 2b + msub x6, x7, x10, x6 // dst -= bottom_ext * stride + subs x4, x4, #32 // bw -= 32 + add x6, x6, #64 // dst += 32 + b.gt 1b + +3: + cbz x5, 3f + // need_top + msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64 + mov x3, x5 +2: + subs x3, x3, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 + b.gt 2b + msub x6, x7, x5, x6 // dst -= top_ext * stride + subs x0, x0, #32 // bw -= 32 + add x6, x6, #64 // dst += 32 + b.gt 1b + +3: + ret +endfunc diff --git a/src/arm/mc_init_tmpl.c b/src/arm/mc_init_tmpl.c index f9cb8bc..ca87947 100644 --- a/src/arm/mc_init_tmpl.c +++ b/src/arm/mc_init_tmpl.c @@ -112,7 +112,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) { c->warp8x8 = BF(dav1d_warp_affine_8x8, neon); c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); #endif -#if BITDEPTH == 8 && ARCH_AARCH64 +#if ARCH_AARCH64 c->emu_edge = BF(dav1d_emu_edge, neon); #endif } |