diff options
| field | value | date |
|---|---|---|
| author | B Krishnan Iyer <krishnaniyer97@gmail.com> | 2019-08-02 01:45:52 +0300 |
| committer | B Krishnan Iyer <krishnaniyer97@gmail.com> | 2019-08-15 21:26:48 +0300 |
| commit | 3d94fb9aff5d2837c9ee0c13fff3d4e2424623ae (patch) | |
| tree | 62f15b92f3e55844b431a1685ab2a87a9dfcc3b0 | |
| parent | 1dc2dc7d27bd0075684945b00b3539be429886aa (diff) | |
arm64: mc: NEON implementation of w_mask_444/422/420 function
Results (first column: A73, second column: A53):
w_mask_420_w4_8bpc_c: 818 1082.9
w_mask_420_w4_8bpc_neon: 79 126.6
w_mask_420_w8_8bpc_c: 2486 3399.8
w_mask_420_w8_8bpc_neon: 200.2 343.7
w_mask_420_w16_8bpc_c: 8022.3 10989.6
w_mask_420_w16_8bpc_neon: 528.1 889
w_mask_420_w32_8bpc_c: 31851.8 42808.6
w_mask_420_w32_8bpc_neon: 2062.5 3380.8
w_mask_420_w64_8bpc_c: 79268.5 102683.9
w_mask_420_w64_8bpc_neon: 5252.9 8575.4
w_mask_420_w128_8bpc_c: 193704.1 255586.5
w_mask_420_w128_8bpc_neon: 14602.3 22167.7
w_mask_422_w4_8bpc_c: 777.3 1038.5
w_mask_422_w4_8bpc_neon: 72.1 112.9
w_mask_422_w8_8bpc_c: 2405.7 3168
w_mask_422_w8_8bpc_neon: 191.9 314.1
w_mask_422_w16_8bpc_c: 7783.7 10543.9
w_mask_422_w16_8bpc_neon: 559.8 835.5
w_mask_422_w32_8bpc_c: 30895.7 41141.2
w_mask_422_w32_8bpc_neon: 2089.7 3187.2
w_mask_422_w64_8bpc_c: 75500.2 98766.3
w_mask_422_w64_8bpc_neon: 5379 8208.2
w_mask_422_w128_8bpc_c: 186967.1 245809.1
w_mask_422_w128_8bpc_neon: 15159.9 21474.5
w_mask_444_w4_8bpc_c: 850.1 1136.6
w_mask_444_w4_8bpc_neon: 66.5 104.7
w_mask_444_w8_8bpc_c: 2373.5 3262.9
w_mask_444_w8_8bpc_neon: 180.5 290.2
w_mask_444_w16_8bpc_c: 7291.6 10590.7
w_mask_444_w16_8bpc_neon: 550.9 809.7
w_mask_444_w32_8bpc_c: 8048.3 10140.8
w_mask_444_w32_8bpc_neon: 2136.2 3095
w_mask_444_w64_8bpc_c: 18055.3 23060
w_mask_444_w64_8bpc_neon: 5522.5 8124.8
w_mask_444_w128_8bpc_c: 42754.3 56072
w_mask_444_w128_8bpc_neon: 15569.5 21531.5
-rw-r--r-- | src/arm/64/mc.S | 222 |
-rw-r--r-- | src/arm/mc_init_tmpl.c | 7 |
2 files changed, 225 insertions, 4 deletions
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index 439cd26..3df8409 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -234,6 +234,228 @@ bidir_fn w_avg
 bidir_fn mask
 
 
+.macro w_mask_fn type // emits w_mask_<type>_8bpc_neon; instantiated below for 444, 422 and 420
+function w_mask_\type\()_8bpc_neon, export=1 // as used below: x0 = 8-bit output rows, x1 = row stride, x2/x3 = 16-bit inputs tmp1/tmp2, w4 = width, w5 = rows, x6 = mask output, w7 = rounding bias for 422/420 (presumably the sign argument - confirm against the C prototype)
+        clz             w8,  w4 // leading zero count of the width
+        adr             x9,  L(w_mask_\type\()_tbl)
+        sub             w8,  w8,  #24 // table index; 0..5 assuming width is a power of two from 128 down to 4
+        ldrh            w8,  [x9,  x8,  lsl #1] // 16-bit distance from the table back to the width handler
+        sub             x9,  x9,  w8,  uxtw // x9 = address of the width handler
+        mov             w10, #6903
+        dup             v0.8h,   w10 // v0 = 6903 in every lane; weight = (6903 - |tmp1 - tmp2|) >> 8, floored at 0
+.if \type == 444
+        movi            v1.16b,  #64 // 444 stores 64 - weight for every pixel
+.elseif \type == 422
+        dup             v2.8b,   w7
+        movi            v3.8b,   #129
+        sub             v3.8b,   v3.8b,   v2.8b // v3 = 129 - w7; mask byte = (v3 - sum of 2 weights) >> 1
+.elseif \type == 420
+        dup             v2.8h,   w7
+        movi            v3.8h,   #1, lsl #8 // 256
+        sub             v3.8h,   v3.8h,   v2.8h // v3 = 256 - w7; mask byte = (v3 - sum of 4 weights + 2) >> 2
+.endif
+        add             x12,  x0,  x1 // x12 = second output row
+        lsl             x1,   x1,  #1 // x0 and x12 each step two rows at a time
+        br              x9 // dispatch on width
+4: // width 4: four rows per iteration
+        ld1             {v4.8h,   v5.8h},   [x2],  #32 // tmp1 (four rows at once)
+        ld1             {v6.8h,   v7.8h},   [x3],  #32 // tmp2 (four rows at once)
+        subs            w5,  w5,  #4 // four rows consumed; flags are tested by b.gt at the loop end
+        sub             v16.8h,  v6.8h,   v4.8h // diff = tmp2 - tmp1
+        sub             v17.8h,  v7.8h,   v5.8h
+        sabd            v18.8h,  v4.8h,   v6.8h // |tmp1 - tmp2|
+        sabd            v19.8h,  v5.8h,   v7.8h
+        uqsub           v18.8h,  v0.8h,   v18.8h // 6903 - |diff|, saturating at 0
+        uqsub           v19.8h,  v0.8h,   v19.8h
+        ushr            v18.8h,  v18.8h,  #8 // weight
+        ushr            v19.8h,  v19.8h,  #8
+        shl             v20.8h,  v18.8h,  #9 // pre-scale so that sqdmulh yields (weight * diff) >> 6
+        shl             v21.8h,  v19.8h,  #9
+        sqdmulh         v20.8h,  v20.8h,  v16.8h
+        sqdmulh         v21.8h,  v21.8h,  v17.8h
+        add             v20.8h,  v20.8h,  v4.8h // tmp1 + weighted diff
+        add             v21.8h,  v21.8h,  v5.8h
+        sqrshrun        v22.8b,  v20.8h,  #4 // rounding shift right by 4, saturated to unsigned 8 bit
+        sqrshrun        v23.8b,  v21.8h,  #4
+.if \type == 444
+        xtn             v18.8b,   v18.8h // narrow the 16 weights to bytes
+        xtn2            v18.16b,  v19.8h
+        sub             v18.16b,  v1.16b,   v18.16b // 64 - weight
+        st1             {v18.16b},  [x6],  #16
+.elseif \type == 422
+        addp            v18.8h,   v18.8h,  v19.8h // sums of horizontally adjacent weights
+        xtn             v18.8b,   v18.8h
+        uhsub           v18.8b,   v3.8b,   v18.8b // (v3 - sum) >> 1
+        st1             {v18.8b},  [x6],  #8
+.elseif \type == 420
+        trn1            v24.2d,   v18.2d,  v19.2d // rows 0 and 2
+        trn2            v25.2d,   v18.2d,  v19.2d // rows 1 and 3
+        add             v24.8h,   v24.8h,  v25.8h // vertical sums of each row pair
+        addp            v18.8h,   v24.8h,  v24.8h // plus horizontal neighbours: 2x2 sums
+        sub             v18.4h,   v3.4h,   v18.4h
+        rshrn           v18.8b,   v18.8h,  #2 // (v3 - sum + 2) >> 2
+        st1             {v18.s}[0],  [x6],  #4
+.endif
+        st1             {v22.s}[0],  [x0],  x1 // row 0
+        st1             {v22.s}[1],  [x12], x1 // row 1
+        st1             {v23.s}[0],  [x0],  x1 // row 2
+        st1             {v23.s}[1],  [x12], x1 // row 3
+        b.gt            4b // loop while rows remain (flags from subs above)
+        ret
+8: // width 8: two rows per iteration (v4/v6 = first row, v5/v7 = second row)
+        ld1             {v4.8h,   v5.8h},   [x2],  #32
+        ld1             {v6.8h,   v7.8h},   [x3],  #32
+        subs            w5,  w5,  #2 // two rows consumed
+        sub             v16.8h,  v6.8h,   v4.8h // diff = tmp2 - tmp1
+        sub             v17.8h,  v7.8h,   v5.8h
+        sabd            v18.8h,  v4.8h,   v6.8h // |tmp1 - tmp2|
+        sabd            v19.8h,  v5.8h,   v7.8h
+        uqsub           v18.8h,  v0.8h,   v18.8h // 6903 - |diff|, saturating at 0
+        uqsub           v19.8h,  v0.8h,   v19.8h
+        ushr            v18.8h,  v18.8h,  #8 // weight
+        ushr            v19.8h,  v19.8h,  #8
+        shl             v20.8h,  v18.8h,  #9 // pre-scale for sqdmulh
+        shl             v21.8h,  v19.8h,  #9
+        sqdmulh         v20.8h,  v20.8h,  v16.8h // (weight * diff) >> 6
+        sqdmulh         v21.8h,  v21.8h,  v17.8h
+        add             v20.8h,  v20.8h,  v4.8h // tmp1 + weighted diff
+        add             v21.8h,  v21.8h,  v5.8h
+        sqrshrun        v22.8b,  v20.8h,  #4 // rounding shift right by 4, saturated to unsigned 8 bit
+        sqrshrun        v23.8b,  v21.8h,  #4
+.if \type == 444
+        xtn             v18.8b,   v18.8h
+        xtn2            v18.16b,  v19.8h
+        sub             v18.16b,  v1.16b,   v18.16b // 64 - weight
+        st1             {v18.16b},  [x6],  #16
+.elseif \type == 422
+        addp            v18.8h,   v18.8h,  v19.8h // sums of horizontally adjacent weights
+        xtn             v18.8b,   v18.8h
+        uhsub           v18.8b,   v3.8b,   v18.8b // (v3 - sum) >> 1
+        st1             {v18.8b},  [x6],  #8
+.elseif \type == 420
+        add             v18.8h,   v18.8h,  v19.8h // vertical sums of the two rows
+        addp            v18.8h,   v18.8h,  v18.8h // plus horizontal neighbours: 2x2 sums
+        sub             v18.4h,   v3.4h,   v18.4h
+        rshrn           v18.8b,   v18.8h,  #2 // (v3 - sum + 2) >> 2
+        st1             {v18.s}[0],  [x6],  #4
+.endif
+        st1             {v22.8b},  [x0],  x1 // first row of the pair
+        st1             {v23.8b},  [x12], x1 // second row of the pair
+        b.gt            8b
+        ret
+1280: // widths 128, 64, 32 and 16 share one handler
+640:
+320:
+160:
+        mov             w11, w4 // x11 = width, shifted right by one below for the 422 mask offsets
+        sub             x1,  x1,  w4,  uxtw // x1 = 2 * stride - width: from the end of a row to the row two below
+.if \type == 444
+        add             x10, x6,  w4,  uxtw // x10 = second mask row (one byte per pixel)
+.elseif \type == 422
+        add             x10, x6,  x11, lsr #1 // x10 = second mask row (one byte per two pixels)
+.endif
+        add             x9,  x3,  w4,  uxtw #1 // x9 = second row of tmp2 (2 bytes per sample); the jump target is no longer needed
+        add             x7,  x2,  w4,  uxtw #1 // x7 = second row of tmp1; the w7 input was already folded into v3
+161: // one iteration per pair of rows
+        mov             w8,  w4 // w8 = pixels left in this pair of rows
+16: // 16 pixels of both rows per iteration
+        ld1             {v4.8h,   v5.8h},   [x2],  #32 // tmp1, first row
+        ld1             {v6.8h,   v7.8h},   [x3],  #32 // tmp2, first row
+        ld1             {v16.8h,  v17.8h},  [x7],  #32 // tmp1, second row
+        ld1             {v18.8h,  v19.8h},  [x9],  #32 // tmp2, second row
+        subs            w8,  w8,  #16
+        sub             v6.8h,   v6.8h,   v4.8h // diff = tmp2 - tmp1 (overwrites tmp2)
+        sub             v7.8h,   v7.8h,   v5.8h
+        sub             v18.8h,  v18.8h,  v16.8h
+        sub             v19.8h,  v19.8h,  v17.8h
+        abs             v20.8h,  v6.8h // |diff|
+        abs             v21.8h,  v7.8h
+        abs             v22.8h,  v18.8h
+        abs             v23.8h,  v19.8h
+        uqsub           v20.8h,  v0.8h,   v20.8h // 6903 - |diff|, saturating at 0
+        uqsub           v21.8h,  v0.8h,   v21.8h
+        uqsub           v22.8h,  v0.8h,   v22.8h
+        uqsub           v23.8h,  v0.8h,   v23.8h
+        ushr            v20.8h,  v20.8h,  #8 // weight
+        ushr            v21.8h,  v21.8h,  #8
+        ushr            v22.8h,  v22.8h,  #8
+        ushr            v23.8h,  v23.8h,  #8
+        shl             v24.8h,  v20.8h,  #9 // pre-scale for sqdmulh
+        shl             v25.8h,  v21.8h,  #9
+        shl             v26.8h,  v22.8h,  #9
+        shl             v27.8h,  v23.8h,  #9
+        sqdmulh         v24.8h,  v24.8h,  v6.8h // (weight * diff) >> 6
+        sqdmulh         v25.8h,  v25.8h,  v7.8h
+        sqdmulh         v26.8h,  v26.8h,  v18.8h
+        sqdmulh         v27.8h,  v27.8h,  v19.8h
+        add             v24.8h,  v24.8h,  v4.8h // tmp1 + weighted diff
+        add             v25.8h,  v25.8h,  v5.8h
+        add             v26.8h,  v26.8h,  v16.8h
+        add             v27.8h,  v27.8h,  v17.8h
+        sqrshrun        v24.8b,  v24.8h,  #4 // rounding shift right by 4, saturated to unsigned 8 bit
+        sqrshrun        v25.8b,  v25.8h,  #4
+        sqrshrun        v26.8b,  v26.8h,  #4
+        sqrshrun        v27.8b,  v27.8h,  #4
+.if \type == 444
+        xtn             v20.8b,   v20.8h // first row weights as bytes
+        xtn2            v20.16b,  v21.8h
+        xtn             v21.8b,   v22.8h // second row weights as bytes
+        xtn2            v21.16b,  v23.8h
+        sub             v20.16b,  v1.16b,   v20.16b // 64 - weight
+        sub             v21.16b,  v1.16b,   v21.16b
+        st1             {v20.16b},  [x6],  #16
+        st1             {v21.16b},  [x10], #16
+.elseif \type == 422
+        addp            v20.8h,   v20.8h,  v21.8h // horizontal pair sums, first row
+        addp            v21.8h,   v22.8h,  v23.8h // horizontal pair sums, second row
+        xtn             v20.8b,   v20.8h
+        xtn             v21.8b,   v21.8h
+        uhsub           v20.8b,   v3.8b,   v20.8b // (v3 - sum) >> 1
+        uhsub           v21.8b,   v3.8b,   v21.8b
+        st1             {v20.8b},  [x6],  #8
+        st1             {v21.8b},  [x10], #8
+.elseif \type == 420
+        add             v20.8h,   v20.8h,  v22.8h // vertical sums of the two rows
+        add             v21.8h,   v21.8h,  v23.8h
+        addp            v20.8h,   v20.8h,  v21.8h // plus horizontal neighbours: 2x2 sums
+        sub             v20.8h,   v3.8h,   v20.8h
+        rshrn           v20.8b,   v20.8h,  #2 // (v3 - sum + 2) >> 2
+        st1             {v20.8b},  [x6],  #8
+.endif
+        st1             {v24.8b,  v25.8b},  [x0],  #16 // 16 output pixels, first row
+        st1             {v26.8b,  v27.8b},  [x12], #16 // 16 output pixels, second row
+        b.gt            16b // flags still come from subs w8 above; nothing in between writes them
+        subs            w5,  w5,  #2 // two rows done
+        add             x2,  x2,  w4,  uxtw #1 // skip the row that was just read through x7
+        add             x3,  x3,  w4,  uxtw #1 // skip the row that was just read through x9
+        add             x7,  x7,  w4,  uxtw #1
+        add             x9,  x9,  w4,  uxtw #1
+.if \type == 444
+        add             x6,  x6,  w4,  uxtw // skip the mask row written through x10
+        add             x10, x10, w4,  uxtw
+.elseif \type == 422
+        add             x6,  x6,  x11, lsr #1 // skip the half-width mask row written through x10
+        add             x10, x10, x11, lsr #1
+.endif
+        add             x0,  x0,  x1 // x1 = 2 * stride - width: down two rows, back to column 0
+        add             x12, x12, x1
+        b.gt            161b // flags from subs w5 above; the add instructions in between do not set flags
+        ret
+L(w_mask_\type\()_tbl): // jump table of 16-bit offsets, indexed by clz(width) - 24
+        .hword L(w_mask_\type\()_tbl) - 1280b // index 0: width 128
+        .hword L(w_mask_\type\()_tbl) -  640b // index 1: width 64
+        .hword L(w_mask_\type\()_tbl) -  320b // index 2: width 32
+        .hword L(w_mask_\type\()_tbl) -  160b // index 3: width 16
+        .hword L(w_mask_\type\()_tbl) -    8b // index 4: width 8
+        .hword L(w_mask_\type\()_tbl) -    4b // index 5: width 4
+endfunc
+.endm
+
+w_mask_fn 444
+w_mask_fn 422
+w_mask_fn 420
+
+
 function blend_8bpc_neon, export=1
         adr             x6,  L(blend_tbl)
         clz             w3,  w3
diff --git a/src/arm/mc_init_tmpl.c b/src/arm/mc_init_tmpl.c
index 30086a4..ff53b93 100644
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -104,13 +104,12 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
     c->blend = dav1d_blend_8bpc_neon;
     c->blend_h = dav1d_blend_h_8bpc_neon;
     c->blend_v = dav1d_blend_v_8bpc_neon;
-#if ARCH_AARCH64
-    c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
-    c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
-#elif ARCH_ARM
     c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
     c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
     c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
+#if ARCH_AARCH64
+    c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
+    c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
 #endif
 #endif
 } |