From e39a9212ab37a55b346801c77487d8a47b6f9fe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 1 Feb 2019 10:08:20 +0200 Subject: aarch64: vp8: Port bilin functions from arm version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cortex A53 A72 A73 vp8_put_bilin4_h_c: 303.8 102.2 161.8 vp8_put_bilin4_h_neon: 100.0 40.9 41.2 vp8_put_bilin4_hv_c: 322.8 201.0 305.9 vp8_put_bilin4_hv_neon: 156.8 72.6 77.0 vp8_put_bilin4_v_c: 304.7 101.7 166.5 vp8_put_bilin4_v_neon: 82.7 41.2 33.0 vp8_put_bilin8_h_c: 1192.7 352.5 623.8 vp8_put_bilin8_h_neon: 213.5 70.2 87.8 vp8_put_bilin8_hv_c: 1098.6 769.2 1041.9 vp8_put_bilin8_hv_neon: 324.0 123.5 146.0 vp8_put_bilin8_v_c: 1193.9 350.4 617.7 vp8_put_bilin8_v_neon: 183.9 60.7 64.7 vp8_put_bilin16_h_c: 2353.1 671.2 1223.3 vp8_put_bilin16_h_neon: 261.9 140.7 145.0 vp8_put_bilin16_hv_c: 2453.2 1470.9 2355.2 vp8_put_bilin16_hv_neon: 383.9 196.0 217.0 vp8_put_bilin16_v_c: 2349.3 669.8 1251.2 vp8_put_bilin16_v_neon: 202.9 110.7 96.2 Signed-off-by: Martin Storsjö --- libavcodec/aarch64/vp8dsp.h | 5 + libavcodec/aarch64/vp8dsp_init_aarch64.c | 32 ++++ libavcodec/aarch64/vp8dsp_neon.S | 292 +++++++++++++++++++++++++++++++ 3 files changed, 329 insertions(+) (limited to 'libavcodec/aarch64') diff --git a/libavcodec/aarch64/vp8dsp.h b/libavcodec/aarch64/vp8dsp.h index 40d0cae266..616252ebc7 100644 --- a/libavcodec/aarch64/vp8dsp.h +++ b/libavcodec/aarch64/vp8dsp.h @@ -67,4 +67,9 @@ VP8_MC(epel ## w ## _h4v6, opt); \ VP8_MC(epel ## w ## _h6v6, opt) +#define VP8_BILIN(w, opt) \ + VP8_MC(bilin ## w ## _h, opt); \ + VP8_MC(bilin ## w ## _v, opt); \ + VP8_MC(bilin ## w ## _hv, opt) + #endif /* AVCODEC_AARCH64_VP8DSP_H */ diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c index 4cb3ae6b2b..723afb4afd 100644 --- a/libavcodec/aarch64/vp8dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -36,6 +36,9 @@ VP8_EPEL(16, neon); VP8_EPEL(8, neon); VP8_EPEL(4, neon); +VP8_BILIN(16, neon); +VP8_BILIN(8, neon); +VP8_BILIN(4, neon); av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) { @@ -64,6 +67,35 @@ av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon; dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon; dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon; + + dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; + dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon; + dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon; + + dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; + dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon; + dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon; + + dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon; + dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon; } av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index 7fe24669f4..604be8a8bf 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -1509,3 +1509,295 @@ function ff_put_vp8_epel4_h4v4_neon, export=1 add sp, sp, #44 ret endfunc + +/* Bilinear MC */ + +function ff_put_vp8_bilin16_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b,v3.8b,v4.8b}, [x2], x3 + ext v5.8b, v3.8b, v4.8b, #1 + ext v4.8b, v2.8b, v3.8b, #1 + umull v16.8h, v2.8b, v1.8b + umlal v16.8h, v4.8b, v0.8b + ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3 + umull v6.8h, v3.8b, v1.8b + umlal v6.8h, v5.8b, v0.8b + ext v21.8b, v19.8b, v20.8b, #1 + ext v20.8b, v18.8b, v19.8b, #1 + umull v22.8h, v18.8b, v1.8b + umlal v22.8h, v20.8b, v0.8b + umull v24.8h, v19.8b, v1.8b + umlal v24.8h, v21.8b, v0.8b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v6.8h, #3 + rshrn v6.8b, v22.8h, #3 + rshrn2 v6.16b, v24.8h, #3 + st1 {v4.16b}, [x0], x1 + st1 {v6.16b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin16_v_neon, export=1 + mov w7, #8 + dup v0.16b, w6 + sub w6, w7, w6 + dup v1.16b, w6 + + ld1 {v2.16b}, [x2], x3 +1: + subs w4, w4, #2 + ld1 {v4.16b}, [x2], x3 + umull v6.8h, v2.8b, v1.8b + umlal v6.8h, v4.8b, v0.8b + umull2 v16.8h, v2.16b, v1.16b + umlal2 v16.8h, v4.16b, v0.16b + ld1 {v2.16b}, [x2], x3 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v2.8b, v0.8b + umull2 v20.8h, v4.16b, v1.16b + umlal2 v20.8h, v2.16b, v0.16b + rshrn v4.8b, v6.8h, #3 + rshrn2 v4.16b, v16.8h, #3 + rshrn v6.8b, v18.8h, #3 + rshrn2 v6.16b, v20.8h, #3 + st1 {v4.16b}, [x0], x1 + st1 {v6.16b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin16_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.16b, w6 // my + sub w6, w7, w6 + dup v3.16b, w6 + + ld1 {v4.8b,v5.8b,v6.8b}, [x2], x3 + + ext v7.8b, v5.8b, v6.8b, #1 + ext v6.8b, v4.8b, v5.8b, #1 + umull v16.8h, v4.8b, v1.8b + umlal v16.8h, v6.8b, v0.8b + umull v18.8h, v5.8b, v1.8b + umlal v18.8h, v7.8b, v0.8b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3 + ext v21.8b, v19.8b, v20.8b, #1 + ext v20.8b, v18.8b, v19.8b, #1 + umull v22.8h, v18.8b, v1.8b + umlal v22.8h, v20.8b, v0.8b + ld1 {v26.8b,v27.8b,v28.8b}, [x2], x3 + umull v24.8h, v19.8b, v1.8b + umlal v24.8h, v21.8b, v0.8b + ext v29.8b, v27.8b, v28.8b, #1 + ext v28.8b, v26.8b, v27.8b, #1 + umull v16.8h, v26.8b, v1.8b + umlal v16.8h, v28.8b, v0.8b + umull v18.8h, v27.8b, v1.8b + umlal v18.8h, v29.8b, v0.8b + rshrn v6.8b, v22.8h, #3 + rshrn2 v6.16b, v24.8h, #3 + umull v24.8h, v4.8b, v3.8b + umlal v24.8h, v6.8b, v2.8b + umull2 v30.8h, v4.16b, v3.16b + umlal2 v30.8h, v6.16b, v2.16b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v18.8h, #3 + umull v20.8h, v6.8b, v3.8b + umlal v20.8h, v4.8b, v2.8b + umull2 v22.8h, v6.16b, v3.16b + umlal2 v22.8h, v4.16b, v2.16b + rshrn v24.8b, v24.8h, #3 + rshrn2 v24.16b, v30.8h, #3 + st1 {v24.16b}, [x0], x1 + rshrn v20.8b, v20.8h, #3 + rshrn2 v20.16b, v22.8h, #3 + st1 {v20.16b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin8_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b,v3.8b}, [x2], x3 + ext v3.8b, v2.8b, v3.8b, #1 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + ld1 {v6.8b,v7.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + rshrn v16.8b, v16.8h, #3 + st1 {v4.8b}, [x0], x1 + st1 {v16.8b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin8_v_neon, export=1 + mov w7, #8 + dup v0.8b, w6 + sub w6, w7, w6 + dup v1.8b, w6 + + ld1 {v2.8b}, [x2], x3 +1: + subs w4, w4, #2 + ld1 {v3.8b}, [x2], x3 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + ld1 {v2.8b}, [x2], x3 + umull v6.8h, v3.8b, v1.8b + umlal v6.8h, v2.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + rshrn v6.8b, v6.8h, #3 + st1 {v4.8b}, [x0], x1 + st1 {v6.8b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin8_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.8b, w6 // my + sub w6, w7, w6 + dup v3.8b, w6 + + ld1 {v4.8b,v5.8b}, [x2], x3 + ext v5.8b, v4.8b, v5.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v22.8b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v6.8b,v7.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + ld1 {v4.8b,v5.8b}, [x2], x3 + ext v5.8b, v4.8b, v5.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v16.8b, v16.8h, #3 + umull v20.8h, v22.8b, v3.8b + umlal v20.8h, v16.8b, v2.8b + rshrn v22.8b, v18.8h, #3 + umull v24.8h, v16.8b, v3.8b + umlal v24.8h, v22.8b, v2.8b + rshrn v20.8b, v20.8h, #3 + st1 {v20.8b}, [x0], x1 + rshrn v23.8b, v24.8h, #3 + st1 {v23.8b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin4_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b}, [x2], x3 + ext v3.8b, v2.8b, v3.8b, #1 + ld1 {v6.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + trn1 v2.2s, v2.2s, v6.2s + trn1 v3.2s, v3.2s, v7.2s + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin4_v_neon, export=1 + mov w7, #8 + dup v0.8b, w6 + sub w6, w7, w6 + dup v1.8b, w6 + + ld1r {v2.2s}, [x2], x3 +1: + ld1r {v3.2s}, [x2] + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + trn2 v2.2s, v3.2s, v2.2s + rshrn v4.8b, v4.8h, #3 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 + subs w4, w4, #2 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin4_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.8b, w6 // my + sub w6, w7, w6 + dup v3.8b, w6 + + ld1 {v4.8b}, [x2], x3 + ext v5.8b, v4.8b, v4.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v22.8b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v6.8b}, [x2], x3 + ext v7.8b, v6.8b, v6.8b, #1 + ld1 {v4.8b}, [x2], x3 + ext v5.8b, v4.8b, v4.8b, #1 + trn1 v6.2s, v6.2s, v4.2s + trn1 v7.2s, v7.2s, v5.2s + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + rshrn v16.8b, v16.8h, #3 + umull v20.8h, v16.8b, v2.8b + trn1 v22.2s, v22.2s, v16.2s + umlal v20.8h, v22.8b, v3.8b + rev64 v22.2s, v16.2s + rshrn v20.8b, v20.8h, #3 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x0], x1 + b.gt 1b + + ret +endfunc -- cgit v1.2.3