diff options
-rw-r--r-- | src/arm/64/mc.S | 2101 | ||||
-rw-r--r-- | src/arm/asm.S | 2 | ||||
-rw-r--r-- | src/arm/mc_init_tmpl.c | 50 |
3 files changed, 2153 insertions, 0 deletions
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index f5578dc..ecdbbcc 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -1,6 +1,7 @@ /* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Janne Grunau + * Copyright © 2018, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,6 +27,7 @@ */ #include "src/arm/asm.S" +#include "src/arm/64/util.S" .macro avg dst, t0, t1 ld1 {\t0\().8h}, [x2], 16 @@ -230,3 +232,2102 @@ endfunc bidir_fn avg bidir_fn w_avg bidir_fn mask + + +// This has got the same signature as the put_8tap functions, +// and assumes that x8 is set to (24-clz(w)). +function put + adr x9, L(put_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +2: + ld1 {v0.h}[0], [x2], x3 + ld1 {v1.h}[0], [x2], x3 + subs w5, w5, #2 + st1 {v0.h}[0], [x0], x1 + st1 {v1.h}[0], [x0], x1 + b.gt 2b + ret +4: + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x2], x3 + subs w5, w5, #2 + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + b.gt 4b + ret +8: + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x2], x3 + subs w5, w5, #2 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + b.gt 8b + ret +160: + add x8, x0, x1 + lsl x1, x1, #1 + add x9, x2, x3 + lsl x3, x3, #1 +16: + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x9], x3 + subs w5, w5, #2 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x8], x1 + b.gt 16b + ret +32: + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + stp x6, x7, [x0] + subs w5, w5, #1 + stp x8, x9, [x0, #16] + add x2, x2, x3 + add x0, x0, x1 + b.gt 32b + ret +64: + ldp x6, x7, [x2] + ldp x8, x9, [x2, #16] + stp x6, x7, [x0] + ldp x10, x11, [x2, #32] + stp x8, x9, [x0, #16] + subs w5, w5, #1 + ldp x12, x13, [x2, #48] + stp x10, x11, [x0, #32] + stp x12, x13, [x0, #48] + add x2, x2, x3 + add x0, x0, x1 + b.gt 64b + ret +128: + ldp q0, q1, [x2] + ldp q2, q3, [x2, #32] + stp q0, q1, [x0] + ldp q4, q5, [x2, #64] + stp q2, q3, [x0, #32] + ldp q6, q7, [x2, #96] + subs w5, w5, #1 + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x2, x2, x3 + add x0, x0, x1 + b.gt 128b + ret + +L(put_tbl): + .hword L(put_tbl) - 128b + .hword L(put_tbl) - 64b + .hword L(put_tbl) - 32b + .hword L(put_tbl) - 160b + .hword L(put_tbl) - 8b + .hword L(put_tbl) - 4b + .hword L(put_tbl) - 2b +endfunc + + +// This has got the same signature as the prep_8tap functions, +// and assumes that x8 is set to (24-clz(w)), and x7 to w*2. +function prep + adr x9, L(prep_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +4: + ld1 {v0.s}[0], [x1], x2 + ld1 {v1.s}[0], [x1], x2 + subs w4, w4, #2 + ushll v0.8h, v0.8b, #4 + ushll v1.8h, v1.8b, #4 + st1 {v0.4h, v1.4h}, [x0], #16 + b.gt 4b + ret +8: + ld1 {v0.8b}, [x1], x2 + ld1 {v1.8b}, [x1], x2 + subs w4, w4, #2 + ushll v0.8h, v0.8b, #4 + ushll v1.8h, v1.8b, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 8b + ret +160: + add x9, x1, x2 + lsl x2, x2, #1 +16: + ld1 {v0.16b}, [x1], x2 + ld1 {v1.16b}, [x9], x2 + subs w4, w4, #2 + ushll v4.8h, v0.8b, #4 + ushll2 v5.8h, v0.16b, #4 + ushll v6.8h, v1.8b, #4 + ushll2 v7.8h, v1.16b, #4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + b.gt 16b + ret +320: + add x8, x0, w3, uxtw +32: + ld1 {v0.16b, v1.16b}, [x1], x2 + subs w4, w4, #2 + ushll v4.8h, v0.8b, #4 + ushll2 v5.8h, v0.16b, #4 + ld1 {v2.16b, v3.16b}, [x1], x2 + ushll v6.8h, v1.8b, #4 + ushll2 v7.8h, v1.16b, #4 + ushll v16.8h, v2.8b, #4 + st1 {v4.8h, v5.8h}, [x0], x7 + ushll2 v17.8h, v2.16b, #4 + st1 {v6.8h, v7.8h}, [x8], x7 + ushll v18.8h, v3.8b, #4 + st1 {v16.8h, v17.8h}, [x0], x7 + ushll2 v19.8h, v3.16b, #4 + st1 {v18.8h, v19.8h}, [x8], x7 + b.gt 32b + ret +640: + add x8, x0, #32 + mov x6, #64 +64: + ldp q0, q1, [x1] + subs w4, w4, #1 + ushll v4.8h, v0.8b, #4 + ushll2 v5.8h, v0.16b, #4 + ldp q2, q3, [x1, #32] + ushll v6.8h, v1.8b, #4 + ushll2 v7.8h, v1.16b, #4 + add x1, x1, x2 + ushll v16.8h, v2.8b, #4 + st1 {v4.8h, v5.8h}, [x0], x6 + ushll2 v17.8h, v2.16b, #4 + ushll v18.8h, v3.8b, #4 + st1 {v6.8h, v7.8h}, [x8], x6 + ushll2 v19.8h, v3.16b, #4 + st1 {v16.8h, v17.8h}, [x0], x6 + st1 {v18.8h, v19.8h}, [x8], x6 + b.gt 64b + ret +1280: + add x8, x0, #64 + mov x6, #128 +128: + ldp q0, q1, [x1] + ldp q2, q3, [x1, #32] + ushll v16.8h, v0.8b, #4 + ushll2 v17.8h, v0.16b, #4 + ushll v18.8h, v1.8b, #4 + ushll2 v19.8h, v1.16b, #4 + ushll v20.8h, v2.8b, #4 + ushll2 v21.8h, v2.16b, #4 + ldp q4, q5, [x1, #64] + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6 + ushll v22.8h, v3.8b, #4 + ushll2 v23.8h, v3.16b, #4 + ushll v24.8h, v4.8b, #4 + ushll2 v25.8h, v4.16b, #4 + ushll v26.8h, v5.8b, #4 + ushll2 v27.8h, v5.16b, #4 + ldp q6, q7, [x1, #96] + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6 + ushll v28.8h, v6.8b, #4 + ushll2 v29.8h, v6.16b, #4 + ushll v30.8h, v7.8b, #4 + ushll2 v31.8h, v7.16b, #4 + subs w4, w4, #1 + add x1, x1, x2 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6 + b.gt 128b + ret + +L(prep_tbl): + .hword L(prep_tbl) - 1280b + .hword L(prep_tbl) - 640b + .hword L(prep_tbl) - 320b + .hword L(prep_tbl) - 160b + .hword L(prep_tbl) - 8b + .hword L(prep_tbl) - 4b +endfunc + + +.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + ld1 {\d0\wd}[0], [\s0], \strd + ld1 {\d1\wd}[0], [\s1], \strd +.ifnb \d2 + ld1 {\d2\wd}[0], [\s0], \strd + ld1 {\d3\wd}[0], [\s1], \strd +.endif +.ifnb \d4 + ld1 {\d4\wd}[0], [\s0], \strd +.endif +.ifnb \d5 + ld1 {\d5\wd}[0], [\s1], \strd +.endif +.ifnb \d6 + ld1 {\d6\wd}[0], [\s0], \strd +.endif +.endm +.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 + ld1 {\d0\wd}, [\s0], \strd + ld1 {\d1\wd}, [\s1], \strd +.ifnb \d2 + ld1 {\d2\wd}, [\s0], \strd + ld1 {\d3\wd}, [\s1], \strd +.endif +.ifnb \d4 + ld1 {\d4\wd}, [\s0], \strd +.endif +.ifnb \d5 + ld1 {\d5\wd}, [\s1], \strd +.endif +.ifnb \d6 + ld1 {\d6\wd}, [\s0], \strd +.endif +.endm +.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 + load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 +.endm +.macro interleave_1 wd, r0, r1, r2, r3, r4 + trn1 \r0\wd, \r0\wd, \r1\wd + trn1 \r1\wd, \r1\wd, \r2\wd +.ifnb \r3 + trn1 \r2\wd, \r2\wd, \r3\wd + trn1 \r3\wd, \r3\wd, \r4\wd +.endif +.endm +.macro interleave_1_h r0, r1, r2, r3, r4 + interleave_1 .4h, \r0, \r1, \r2, \r3, \r4 +.endm +.macro interleave_1_s r0, r1, r2, r3, r4 + interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 +.endm +.macro interleave_2 wd, r0, r1, r2, r3, r4, r5 + trn1 \r0\wd, \r0\wd, \r2\wd + trn1 \r1\wd, \r1\wd, \r3\wd + trn1 \r2\wd, \r2\wd, \r4\wd + trn1 \r3\wd, \r3\wd, \r5\wd +.endm +.macro interleave_2_s r0, r1, r2, r3, r4, r5 + interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5 +.endm +.macro uxtl_b r0, r1, r2, r3, r4, r5, r6 + uxtl \r0\().8h, \r0\().8b + uxtl \r1\().8h, \r1\().8b +.ifnb \r2 + uxtl \r2\().8h, \r2\().8b + uxtl \r3\().8h, \r3\().8b +.endif +.ifnb \r4 + uxtl \r4\().8h, \r4\().8b +.endif +.ifnb \r5 + uxtl \r5\().8h, \r5\().8b +.endif +.ifnb \r6 + uxtl \r6\().8h, \r6\().8b +.endif +.endm +.macro mul_mla_4 d, s0, s1, s2, s3, wd + mul \d\wd, \s0\wd, v0.h[0] + mla \d\wd, \s1\wd, v0.h[1] + mla \d\wd, \s2\wd, v0.h[2] + mla \d\wd, \s3\wd, v0.h[3] +.endm +.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + mul \d0\().8h, \s0\().8h, v0.h[0] + mul \d1\().8h, \s1\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] + mla \d1\().8h, \s2\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d1\().8h, \s3\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d1\().8h, \s4\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d1\().8h, \s5\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d1\().8h, \s6\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mla \d1\().8h, \s7\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] + mla \d1\().8h, \s8\().8h, v0.h[7] +.endm +.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 + mul \d0\().8h, \s0\().8h, v0.h[0] + mul \d1\().8h, \s2\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] + mla \d1\().8h, \s3\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d1\().8h, \s4\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d1\().8h, \s5\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d1\().8h, \s6\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d1\().8h, \s7\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mla \d1\().8h, \s8\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] + mla \d1\().8h, \s9\().8h, v0.h[7] +.endm +.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11 + mul \d0\().8h, \s0\().8h, v0.h[0] + mul \d1\().8h, \s4\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] + mla \d1\().8h, \s5\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] + mla \d1\().8h, \s6\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] + mla \d1\().8h, \s7\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] + mla \d1\().8h, \s8\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] + mla \d1\().8h, \s9\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] + mla \d1\().8h, \s10\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] + mla \d1\().8h, \s11\().8h, v0.h[7] +.endm +.macro sqrshrun_b shift, r0, r1, r2, r3 + sqrshrun \r0\().8b, \r0\().8h, #\shift +.ifnb \r1 + sqrshrun \r1\().8b, \r1\().8h, #\shift +.endif +.ifnb \r2 + sqrshrun \r2\().8b, \r2\().8h, #\shift + sqrshrun \r3\().8b, \r3\().8h, #\shift +.endif +.endm +.macro srshr_h shift, r0, r1, r2, r3 + srshr \r0\().8h, \r0\().8h, #\shift +.ifnb \r1 + srshr \r1\().8h, \r1\().8h, #\shift +.endif +.ifnb \r2 + srshr \r2\().8h, \r2\().8h, #\shift + srshr \r3\().8h, \r3\().8h, #\shift +.endif +.endm +.macro st_h strd, reg, lanes + st1 {\reg\().h}[0], [x0], \strd + st1 {\reg\().h}[1], [x8], \strd +.if \lanes > 2 + st1 {\reg\().h}[2], [x0], \strd + st1 {\reg\().h}[3], [x8], \strd +.endif +.endm +.macro st_s strd, r0, r1, r2, r3 + st1 {\r0\().s}[0], [x0], \strd + st1 {\r0\().s}[1], [x8], \strd +.ifnb \r1 + st1 {\r1\().s}[0], [x0], \strd + st1 {\r1\().s}[1], [x8], \strd +.endif +.endm +.macro st_d strd, r0, r1, r2, r3 + st1 {\r0\().d}[0], [x0], \strd + st1 {\r0\().d}[1], [x8], \strd +.ifnb \r1 + st1 {\r1\().d}[0], [x0], \strd + st1 {\r1\().d}[1], [x8], \strd +.endif +.endm +.macro shift_store_4 type, strd, r0, r1, r2, r3 +.ifc \type, put + sqrshrun_b 6, \r0, \r1, \r2, \r3 + st_s \strd, \r0, \r1, \r2, \r3 +.else + srshr_h 2, \r0, \r1, \r2, \r3 + st_d \strd, \r0, \r1, \r2, \r3 +.endif +.endm +.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 + st1 {\r0\wd}, [x0], \strd + st1 {\r1\wd}, [x8], \strd +.ifnb \r2 + st1 {\r2\wd}, [x0], \strd + st1 {\r3\wd}, [x8], \strd +.endif +.ifnb \r4 + st1 {\r4\wd}, [x0], \strd + st1 {\r5\wd}, [x8], \strd + st1 {\r6\wd}, [x0], \strd + st1 {\r7\wd}, [x8], \strd +.endif +.endm +.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 + st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endm +.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 + st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endm +.macro shift_store_8 type, strd, r0, r1, r2, r3 +.ifc \type, put + sqrshrun_b 6, \r0, \r1, \r2, \r3 + st_8b \strd, \r0, \r1, \r2, \r3 +.else + srshr_h 2, \r0, \r1, \r2, \r3 + st_16b \strd, \r0, \r1, \r2, \r3 +.endif +.endm +.macro shift_store_16 type, strd, r0, r1, r2, r3 +.ifc \type, put + sqrshrun \r0\().8b, \r0\().8h, #6 + sqrshrun2 \r0\().16b, \r1\().8h, #6 + sqrshrun \r2\().8b, \r2\().8h, #6 + sqrshrun2 \r2\().16b, \r3\().8h, #6 + st_16b \strd, \r0, \r2 +.else + srshr_h 2, \r0, \r1, \r2, \r3 + st1 {\r0\().8h, \r1\().8h}, [x0], \strd + st1 {\r2\().8h, \r3\().8h}, [x8], \strd +.endif +.endm + +.macro make_8tap_fn op, type, type_h, type_v +function \op\()_8tap_\type\()_8bpc_neon, export=1 + mov x8, \type_h + mov x9, \type_v + b \op\()_8tap +endfunc +.endm + +// No spaces in these expressions, due to gas-preprocessor. +#define REGULAR ((0*15<<7)|3*15) +#define SMOOTH ((1*15<<7)|4*15) +#define SHARP ((2*15<<7)|3*15) + +.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv +make_8tap_fn \type, regular, REGULAR, REGULAR +make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH +make_8tap_fn \type, regular_sharp, REGULAR, SHARP +make_8tap_fn \type, smooth, SMOOTH, SMOOTH +make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR +make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP +make_8tap_fn \type, sharp, SHARP, SHARP +make_8tap_fn \type, sharp_regular, SHARP, REGULAR +make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH + +function \type\()_8tap + mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + mul \mx, \mx, w10 + mul \my, \my, w10 + add \mx, \mx, w8 // mx, 8tap_h, 4tap_h + add \my, \my, w9 // my, 8tap_v, 4tap_v +.ifc \type, prep + uxtw \d_strd, \w + lsl \d_strd, \d_strd, #1 +.endif + + clz w8, \w + tst \mx, #(0x7f << 14) + sub w8, w8, #24 + movrel x10, X(mc_subpel_filters), -8 + b.ne L(\type\()_8tap_h) + tst \my, #(0x7f << 14) + b.ne L(\type\()_8tap_v) + b \type + +L(\type\()_8tap_h): + cmp \w, #4 + ubfm w9, \mx, #7, #13 + and \mx, \mx, #0x7f + b.le 4f + mov \mx, w9 +4: + tst \my, #(0x7f << 14) + add \xmx, x10, \mx, uxtw #3 + b.ne L(\type\()_8tap_hv) + + adr x9, L(\type\()_8tap_h_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN h +.ifc \type, put + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + sub \src, \src, #1 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +2: + ld1 {v4.8b}, [\src], \s_strd + ld1 {v6.8b}, [\sr2], \s_strd + uxtl v4.8h, v4.8b + uxtl v6.8h, v6.8b + ext v5.16b, v4.16b, v4.16b, #2 + ext v7.16b, v6.16b, v6.16b, #2 + subs \h, \h, #2 + trn1 v3.2s, v4.2s, v6.2s + trn2 v6.2s, v4.2s, v6.2s + trn1 v4.2s, v5.2s, v7.2s + trn2 v7.2s, v5.2s, v7.2s + mul v3.4h, v3.4h, v0.h[0] + mla v3.4h, v4.4h, v0.h[1] + mla v3.4h, v6.4h, v0.h[2] + mla v3.4h, v7.4h, v0.h[3] + srshr v3.4h, v3.4h, #2 + sqrshrun v3.8b, v3.8h, #4 + st1 {v3.h}[0], [\dst], \d_strd + st1 {v3.h}[1], [\ds2], \d_strd + b.gt 2b + ret +.endif + +40: // 4xN h + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + sub \src, \src, #1 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +4: + ld1 {v16.8b}, [\src], \s_strd + ld1 {v20.8b}, [\sr2], \s_strd + uxtl v16.8h, v16.8b + uxtl v20.8h, v20.8b + ext v17.16b, v16.16b, v16.16b, #2 + ext v18.16b, v16.16b, v16.16b, #4 + ext v19.16b, v16.16b, v16.16b, #6 + ext v21.16b, v20.16b, v20.16b, #2 + ext v22.16b, v20.16b, v20.16b, #4 + ext v23.16b, v20.16b, v20.16b, #6 + subs \h, \h, #2 + mul v16.4h, v16.4h, v0.h[0] + mla v16.4h, v17.4h, v0.h[1] + mla v16.4h, v18.4h, v0.h[2] + mla v16.4h, v19.4h, v0.h[3] + mul v20.4h, v20.4h, v0.h[0] + mla v20.4h, v21.4h, v0.h[1] + mla v20.4h, v22.4h, v0.h[2] + mla v20.4h, v23.4h, v0.h[3] + srshr v16.4h, v16.4h, #2 + srshr v20.4h, v20.4h, #2 +.ifc \type, put + sqrshrun v16.8b, v16.8h, #4 + sqrshrun v20.8b, v20.8h, #4 + st1 {v16.s}[0], [\dst], \d_strd + st1 {v20.s}[0], [\ds2], \d_strd +.else + st1 {v16.4h}, [\dst], \d_strd + st1 {v20.4h}, [\ds2], \d_strd +.endif + b.gt 4b + ret + +80: // 8xN h + ld1 {v0.8b}, [\xmx] + sub \src, \src, #3 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b +8: + ld1 {v16.8b, v17.8b}, [\src], \s_strd + ld1 {v20.8b, v21.8b}, [\sr2], \s_strd + uxtl v16.8h, v16.8b + uxtl v17.8h, v17.8b + uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + + mul v18.8h, v16.8h, v0.h[0] + mul v22.8h, v20.8h, v0.h[0] +.irpc i, 1234567 + ext v19.16b, v16.16b, v17.16b, #(2*\i) + ext v23.16b, v20.16b, v21.16b, #(2*\i) + mla v18.8h, v19.8h, v0.h[\i] + mla v22.8h, v23.8h, v0.h[\i] +.endr + subs \h, \h, #2 + srshr v18.8h, v18.8h, #2 + srshr v22.8h, v22.8h, #2 +.ifc \type, put + sqrshrun v18.8b, v18.8h, #4 + sqrshrun v22.8b, v22.8h, #4 + st1 {v18.8b}, [\dst], \d_strd + st1 {v22.8b}, [\ds2], \d_strd +.else + st1 {v18.8h}, [\dst], \d_strd + st1 {v22.8h}, [\ds2], \d_strd +.endif + b.gt 8b + ret +160: +320: +640: +1280: // 16xN, 32xN, ... h + ld1 {v0.8b}, [\xmx] + sub \src, \src, #3 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b + + sub \s_strd, \s_strd, \w, uxtw + sub \s_strd, \s_strd, #8 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, uxtw +.endif +161: + ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 + ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 + mov \mx, \w + uxtl v16.8h, v16.8b + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + uxtl v22.8h, v22.8b + +16: + mul v24.8h, v16.8h, v0.h[0] + mul v25.8h, v17.8h, v0.h[0] + mul v26.8h, v20.8h, v0.h[0] + mul v27.8h, v21.8h, v0.h[0] +.irpc i, 1234567 + ext v28.16b, v16.16b, v17.16b, #(2*\i) + ext v29.16b, v17.16b, v18.16b, #(2*\i) + ext v30.16b, v20.16b, v21.16b, #(2*\i) + ext v31.16b, v21.16b, v22.16b, #(2*\i) + mla v24.8h, v28.8h, v0.h[\i] + mla v25.8h, v29.8h, v0.h[\i] + mla v26.8h, v30.8h, v0.h[\i] + mla v27.8h, v31.8h, v0.h[\i] +.endr + srshr v24.8h, v24.8h, #2 + srshr v25.8h, v25.8h, #2 + srshr v26.8h, v26.8h, #2 + srshr v27.8h, v27.8h, #2 + subs \mx, \mx, #16 +.ifc \type, put + sqrshrun v24.8b, v24.8h, #4 + sqrshrun2 v24.16b, v25.8h, #4 + sqrshrun v26.8b, v26.8h, #4 + sqrshrun2 v26.16b, v27.8h, #4 + st1 {v24.16b}, [\dst], #16 + st1 {v26.16b}, [\ds2], #16 +.else + st1 {v24.8h, v25.8h}, [\dst], #32 + st1 {v26.8h, v27.8h}, [\ds2], #32 +.endif + b.le 9f + + mov v16.16b, v18.16b + mov v20.16b, v22.16b + ld1 {v17.8b, v18.8b}, [\src], #16 + ld1 {v21.8b, v22.8b}, [\sr2], #16 + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v21.8h, v21.8b + uxtl v22.8h, v22.8b + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + b.gt 161b + ret + +L(\type\()_8tap_h_tbl): + .hword L(\type\()_8tap_h_tbl) - 1280b + .hword L(\type\()_8tap_h_tbl) - 640b + .hword L(\type\()_8tap_h_tbl) - 320b + .hword L(\type\()_8tap_h_tbl) - 160b + .hword L(\type\()_8tap_h_tbl) - 80b + .hword L(\type\()_8tap_h_tbl) - 40b + .hword L(\type\()_8tap_h_tbl) - 20b + .hword 0 + + +L(\type\()_8tap_v): + cmp \h, #4 + ubfm w9, \my, #7, #13 + and \my, \my, #0x7f + b.le 4f + mov \my, w9 +4: + add \xmy, x10, \my, uxtw #3 + + adr x9, L(\type\()_8tap_v_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN v +.ifc \type, put + b.gt 28f + + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + // 2x2 v + load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + interleave_1_h v1, v2, v3, v4, v5 + b.gt 24f + uxtl_b v1, v2, v3, v4 + mul_mla_4 v6, v1, v2, v3, v4, .4h + sqrshrun_b 6, v6 + st_h \d_strd, v6, 2 + ret + +24: // 2x4 v + load_h \sr2, \src, \s_strd, v6, v7 + interleave_1_h v5, v6, v7 + interleave_2_s v1, v2, v3, v4, v5, v6 + uxtl_b v1, v2, v3, v4 + mul_mla_4 v6, v1, v2, v3, v4, .8h + sqrshrun_b 6, v6 + st_h \d_strd, v6, 4 + ret + +28: // 2x8, 2x16 v + ld1 {v0.8b}, [\xmy] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + sxtl v0.8h, v0.8b + + load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 + interleave_1_h v1, v2, v3, v4, v5 + interleave_1_h v5, v6, v7 + interleave_2_s v1, v2, v3, v4, v5, v6 + uxtl_b v1, v2, v3, v4 +216: + subs \h, \h, #8 + load_h \sr2, \src, \s_strd, v16, v17, v18, v19 + load_h \sr2, \src, \s_strd, v20, v21, v22, v23 + interleave_1_h v7, v16, v17, v18, v19 + interleave_1_h v19, v20, v21, v22, v23 + interleave_2_s v5, v6, v7, v16, v17, v18 + interleave_2_s v17, v18, v19, v20, v21, v22 + uxtl_b v5, v6, v7, v16 + uxtl_b v17, v18, v19, v20 + mul_mla_8_4 v30, v31, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20 + sqrshrun_b 6, v30, v31 + st_h \d_strd, v30, 4 + st_h \d_strd, v31, 4 + b.le 0f + mov v1.16b, v17.16b + mov v2.16b, v18.16b + mov v3.16b, v19.16b + mov v4.16b, v20.16b + mov v5.16b, v21.16b + mov v6.16b, v22.16b + mov v7.16b, v23.16b + b 216b +0: + ret +.endif + +40: + b.gt 480f + + // 4x2, 4x4 v + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + interleave_1_s v1, v2, v3, v4, v5 + uxtl_b v1, v2, v3, v4 + mul_mla_4 v6, v1, v2, v3, v4, .8h + shift_store_4 \type, \d_strd, v6 + b.le 0f + load_s \sr2, \src, \s_strd, v6, v7 + interleave_1_s v5, v6, v7 + uxtl_b v5, v6 + mul_mla_4 v7, v3, v4, v5, v6, .8h + shift_store_4 \type, \d_strd, v7 +0: + ret + +480: // 4x8, 4x16 v + ld1 {v0.8b}, [\xmy] + sub \sr2, \src, \s_strd, lsl #1 + add \ds2, \dst, \d_strd + sub \src, \sr2, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 + interleave_1_s v16, v17, v18 + interleave_1_s v18, v19, v20, v21, v22 + uxtl_b v16, v17 + uxtl_b v18, v19, v20, v21 + +48: + subs \h, \h, #4 + load_s \sr2, \src, \s_strd, v23, v24, v25, v26 + interleave_1_s v22, v23, v24, v25, v26 + uxtl_b v22, v23, v24, v25 + mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + shift_store_4 \type, \d_strd, v1, v2 + b.le 0f + subs \h, \h, #4 + load_s \sr2, \src, \s_strd, v27, v16, v17, v18 + interleave_1_s v26, v27, v16, v17, v18 + uxtl_b v26, v27, v16, v17 + mul_mla_8_2 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17 + shift_store_4 \type, \d_strd, v1, v2 + b.le 0f + subs \h, \h, #4 + load_s \sr2, \src, \s_strd, v19, v20, v21, v22 + interleave_1_s v18, v19, v20, v21, v22 + uxtl_b v18, v19, v20, v21 + mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 + shift_store_4 \type, \d_strd, v1, v2 + b 48b +0: + ret + +80: + b.gt 880f + + // 8x2, 8x4 v + cmp \h, #2 + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + uxtl_b v1, v2, v3, v4, v5 + mul_mla_4 v6, v1, v2, v3, v4, .8h + mul_mla_4 v7, v2, v3, v4, v5, .8h + shift_store_8 \type, \d_strd, v6, v7 + b.le 0f + load_8b \sr2, \src, \s_strd, v6, v7 + uxtl_b v6, v7 + mul_mla_4 v1, v3, v4, v5, v6, .8h + mul_mla_4 v2, v4, v5, v6, v7, .8h + shift_store_8 \type, \d_strd, v1, v2 +0: + ret + +880: // 8x8, 8x16, 8x32 v +1680: // 16x8, 16x16, ... +320: // 32x8, 32x16, ... +640: +1280: + ld1 {v0.8b}, [\xmy] + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + sxtl v0.8h, v0.8b + mov \my, \h +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 + uxtl_b v16, v17, v18, v19, v20, v21, v22 + +88: + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v23, v24 + uxtl_b v23, v24 + mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 + shift_store_8 \type, \d_strd, v1, v2 + b.le 9f + subs \h, \h, #2 + load_8b \sr2, \src, \s_strd, v25, v26 + uxtl_b v25, v26 + mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 + shift_store_8 \type, \d_strd, v3, v4 + b.le 9f + subs \h, \h, #4 + load_8b \sr2, \src, \s_strd, v27, v16, v17, v18 + uxtl_b v27, v16, v17, v18 + mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 + mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 + shift_store_8 \type, \d_strd, v1, v2, v3, v4 + b.le 9f + subs \h, \h, #4 + load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 + uxtl_b v19, v20, v21, v22 + mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 + mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 + shift_store_8 \type, \d_strd, v1, v2, v3, v4 + b.gt 88b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 168b +0: + ret + +160: + b.gt 1680b + + // 16x4 v + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + + cmp \h, #2 + load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 + uxtl v16.8h, v1.8b + uxtl v17.8h, v2.8b + uxtl v18.8h, v3.8b + uxtl v19.8h, v4.8b + uxtl v20.8h, v5.8b + uxtl2 v23.8h, v1.16b + uxtl2 v24.8h, v2.16b + uxtl2 v25.8h, v3.16b + uxtl2 v26.8h, v4.16b + uxtl2 v27.8h, v5.16b + mul_mla_4 v1, v16, v17, v18, v19, .8h + mul_mla_4 v16, v17, v18, v19, v20, .8h + mul_mla_4 v2, v23, v24, v25, v26, .8h + mul_mla_4 v17, v24, v25, v26, v27, .8h + shift_store_16 \type, \d_strd, v1, v2, v16, v17 + b.le 0f + load_16b \sr2, \src, \s_strd, v6, v7 + uxtl v21.8h, v6.8b + uxtl v22.8h, v7.8b + uxtl2 v28.8h, v6.16b + uxtl2 v29.8h, v7.16b + mul_mla_4 v1, v18, v19, v20, v21, .8h + mul_mla_4 v3, v19, v20, v21, v22, .8h + mul_mla_4 v2, v25, v26, v27, v28, .8h + mul_mla_4 v4, v26, v27, v28, v29, .8h + shift_store_16 \type, \d_strd, v1, v2, v3, v4 +0: + ret + +L(\type\()_8tap_v_tbl): + .hword L(\type\()_8tap_v_tbl) - 1280b + .hword L(\type\()_8tap_v_tbl) - 640b + .hword L(\type\()_8tap_v_tbl) - 320b + .hword L(\type\()_8tap_v_tbl) - 160b + .hword L(\type\()_8tap_v_tbl) - 80b + .hword L(\type\()_8tap_v_tbl) - 40b + .hword L(\type\()_8tap_v_tbl) - 20b + .hword 0 + +L(\type\()_8tap_hv): + cmp \h, #4 + ubfm w9, \my, #7, #13 + and \my, \my, #0x7f + b.le 4f + mov \my, w9 +4: + add \xmy, x10, \my, uxtw #3 + + adr x9, L(\type\()_8tap_hv_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: +.ifc \type, put + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + b.gt 280f + add \xmy, \xmy, #2 + ld1 {v1.s}[0], [\xmy] + + // 2x2, 2x4 hv + sub \sr2, \src, #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v28.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + ext v29.16b, v28.16b, v28.16b, #2 + mul v28.4h, v28.4h, v0.4h + mul v29.4h, v29.4h, v0.4h + addv h28, v28.4h + addv h29, v29.4h + trn1 v16.4h, v28.4h, v29.4h + srshr v16.4h, v16.4h, #2 + bl L(\type\()_8tap_filter_2) + + trn1 v16.2s, v16.2s, v28.2s + trn1 v17.2s, v28.2s, v30.2s + mov v18.8b, v30.8b + +2: + bl L(\type\()_8tap_filter_2) + + trn1 v18.2s, v18.2s, v28.2s + trn1 v19.2s, v28.2s, v30.2s + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + + sqrshrn v2.4h, v2.4s, #\shift_hv + sqxtun v2.8b, v2.8h + subs \h, \h, #2 + st1 {v2.h}[0], [\dst], \d_strd + st1 {v2.h}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v30.8b + b 2b + +280: // 2x8, 2x16, 2x32 hv + ld1 {v1.8b}, [\xmy] + sub \src, \src, #1 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v28.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + ext v29.16b, v28.16b, v28.16b, #2 + mul v28.4h, v28.4h, v0.4h + mul v29.4h, v29.4h, v0.4h + addv h28, v28.4h + addv h29, v29.4h + trn1 v16.4h, v28.4h, v29.4h + srshr v16.4h, v16.4h, #2 + + bl L(\type\()_8tap_filter_2) + trn1 v16.2s, v16.2s, v28.2s + trn1 v17.2s, v28.2s, v30.2s + mov v18.8b, v30.8b + bl L(\type\()_8tap_filter_2) + trn1 v18.2s, v18.2s, v28.2s + trn1 v19.2s, v28.2s, v30.2s + mov v20.8b, v30.8b + bl L(\type\()_8tap_filter_2) + trn1 v20.2s, v20.2s, v28.2s + trn1 v21.2s, v28.2s, v30.2s + mov v22.8b, v30.8b + +28: + bl L(\type\()_8tap_filter_2) + trn1 v22.2s, v22.2s, v28.2s + trn1 v23.2s, v28.2s, v30.2s + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal v2.4s, v23.4h, v1.h[7] + + sqrshrn v2.4h, v2.4s, #\shift_hv + sqxtun v2.8b, v2.8h + subs \h, \h, #2 + st1 {v2.h}[0], [\dst], \d_strd + st1 {v2.h}[1], [\ds2], \d_strd + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v23.8b + mov v22.8b, v30.8b + b 28b + +0: + br x15 + +L(\type\()_8tap_filter_2): + ld1 {v28.8b}, [\sr2], \s_strd + ld1 {v30.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v30.8h, v30.8b + ext v29.16b, v28.16b, v28.16b, #2 + ext v31.16b, v30.16b, v30.16b, #2 + trn1 v27.2s, v28.2s, v30.2s + trn2 v30.2s, v28.2s, v30.2s + trn1 v28.2s, v29.2s, v31.2s + trn2 v31.2s, v29.2s, v31.2s + mul v27.4h, v27.4h, v0.h[0] + mla v27.4h, v28.4h, v0.h[1] + mla v27.4h, v30.4h, v0.h[2] + mla v27.4h, v31.4h, v0.h[3] + srshr v28.4h, v27.4h, #2 + trn2 v30.2s, v28.2s, v28.2s + ret +.endif + +40: + add \xmx, \xmx, #2 + ld1 {v0.s}[0], [\xmx] + b.gt 480f + add \xmy, \xmy, #2 + ld1 {v1.s}[0], [\xmy] + sub \sr2, \src, #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + // 4x2, 4x4 hv + ld1 {v26.8b}, [\src], \s_strd + uxtl v26.8h, v26.8b + ext v28.16b, v26.16b, v26.16b, #2 + ext v29.16b, v26.16b, v26.16b, #4 + ext v30.16b, v26.16b, v26.16b, #6 + mul v31.4h, v26.4h, v0.h[0] + mla v31.4h, v28.4h, v0.h[1] + mla v31.4h, v29.4h, v0.h[2] + mla v31.4h, v30.4h, v0.h[3] + srshr v16.4h, v31.4h, #2 + + bl L(\type\()_8tap_filter_4) + mov v17.8b, v28.8b + mov v18.8b, v29.8b + +4: + smull v2.4s, v16.4h, v1.h[0] + bl L(\type\()_8tap_filter_4) + smull v3.4s, v17.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v3.4s, v28.4h, v1.h[2] + smlal v2.4s, v28.4h, v1.h[3] + smlal v3.4s, v29.4h, v1.h[3] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn v3.4h, v3.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v2.s}[0], [\dst], \d_strd + st1 {v3.s}[0], [\ds2], \d_strd +.else + st1 {v2.4h}, [\dst], \d_strd + st1 {v3.4h}, [\ds2], \d_strd +.endif + b.le 0f + mov v16.16b, v18.16b + mov v17.16b, v28.16b + mov v18.16b, v29.16b + b 4b + +480: // 4x8, 4x16, 4x32 hv + ld1 {v1.8b}, [\xmy] + sub \src, \src, #1 + sub \sr2, \src, \s_strd, lsl #1 + sub \src, \sr2, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + + ld1 {v26.8b}, [\src], \s_strd + uxtl v26.8h, v26.8b + ext v28.16b, v26.16b, v26.16b, #2 + ext v29.16b, v26.16b, v26.16b, #4 + ext v30.16b, v26.16b, v26.16b, #6 + mul v31.4h, v26.4h, v0.h[0] + mla v31.4h, v28.4h, v0.h[1] + mla v31.4h, v29.4h, v0.h[2] + mla v31.4h, v30.4h, v0.h[3] + srshr v16.4h, v31.4h, #2 + + bl L(\type\()_8tap_filter_4) + mov v17.8b, v28.8b + mov v18.8b, v29.8b + bl L(\type\()_8tap_filter_4) + mov v19.8b, v28.8b + mov v20.8b, v29.8b + bl L(\type\()_8tap_filter_4) + mov v21.8b, v28.8b + mov v22.8b, v29.8b + +48: + smull v2.4s, v16.4h, v1.h[0] + bl L(\type\()_8tap_filter_4) + smull v3.4s, v17.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v3.4s, v18.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal v3.4s, v19.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal v3.4s, v20.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal v3.4s, v21.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal v3.4s, v22.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal v3.4s, v28.4h, v1.h[6] + smlal v2.4s, v28.4h, v1.h[7] + smlal v3.4s, v29.4h, v1.h[7] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn v3.4h, v3.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v2.s}[0], [\dst], \d_strd + st1 {v3.s}[0], [\ds2], \d_strd +.else + st1 {v2.4h}, [\dst], \d_strd + st1 {v3.4h}, [\ds2], \d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b + mov v18.8b, v20.8b + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v28.8b + mov v22.8b, v29.8b + b 48b +0: + br x15 + +L(\type\()_8tap_filter_4): + ld1 {v26.8b}, [\sr2], \s_strd + ld1 {v27.8b}, [\src], \s_strd + uxtl v26.8h, v26.8b + uxtl v27.8h, v27.8b + ext v28.16b, v26.16b, v26.16b, #2 + ext v29.16b, v26.16b, v26.16b, #4 + ext v30.16b, v26.16b, v26.16b, #6 + mul v31.4h, v26.4h, v0.h[0] + mla v31.4h, v28.4h, v0.h[1] + mla v31.4h, v29.4h, v0.h[2] + mla v31.4h, v30.4h, v0.h[3] + ext v28.16b, v27.16b, v27.16b, #2 + ext v29.16b, v27.16b, v27.16b, #4 + ext v30.16b, v27.16b, v27.16b, #6 + mul v27.4h, v27.4h, v0.h[0] + mla v27.4h, v28.4h, v0.h[1] + mla v27.4h, v29.4h, v0.h[2] + mla v27.4h, v30.4h, v0.h[3] + srshr v28.4h, v31.4h, #2 + srshr v29.4h, v27.4h, #2 + ret + +80: +160: +320: + b.gt 880f + add \xmy, \xmy, #2 + ld1 {v0.8b}, [\xmx] + ld1 {v1.s}[0], [\xmy] + sub \src, \src, #3 + sub \src, \src, \s_strd + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + mov \my, \h + +164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + ld1 {v28.8b, v29.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b + mul v24.8h, v28.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] +.endr + srshr v16.8h, v24.8h, #2 + + bl L(\type\()_8tap_filter_8) + mov v17.16b, v24.16b + mov v18.16b, v25.16b + +8: + smull v2.4s, v16.4h, v1.h[0] + smull2 v3.4s, v16.8h, v1.h[0] + bl L(\type\()_8tap_filter_8) + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal2 v3.4s, v17.8h, v1.h[1] + smlal v4.4s, v18.4h, v1.h[1] + smlal2 v5.4s, v18.8h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal2 v3.4s, v18.8h, v1.h[2] + smlal v4.4s, v24.4h, v1.h[2] + smlal2 v5.4s, v24.8h, v1.h[2] + smlal v2.4s, v24.4h, v1.h[3] + smlal2 v3.4s, v24.8h, v1.h[3] + smlal v4.4s, v25.4h, v1.h[3] + smlal2 v5.4s, v25.8h, v1.h[3] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn2 v2.8h, v3.4s, #\shift_hv + sqrshrn v4.4h, v4.4s, #\shift_hv + sqrshrn2 v4.8h, v5.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v4.8b, v4.8h + st1 {v2.8b}, [\dst], \d_strd + st1 {v4.8b}, [\ds2], \d_strd +.else + st1 {v2.8h}, [\dst], \d_strd + st1 {v4.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + mov v17.16b, v24.16b + mov v18.16b, v25.16b + b 8b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #2 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 164b + +880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv +640: +1280: + ld1 {v0.8b}, [\xmx] + ld1 {v1.8b}, [\xmy] + sub \src, \src, #3 + sub \src, \src, \s_strd + sub \src, \src, \s_strd, lsl #1 + sxtl v0.8h, v0.8b + sxtl v1.8h, v1.8b + mov x15, x30 + mov \my, \h + +168: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 + + ld1 {v28.8b, v29.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b + mul v24.8h, v28.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] +.endr + srshr v16.8h, v24.8h, #2 + + bl L(\type\()_8tap_filter_8) + mov v17.16b, v24.16b + mov v18.16b, v25.16b + bl L(\type\()_8tap_filter_8) + mov v19.16b, v24.16b + mov v20.16b, v25.16b + bl L(\type\()_8tap_filter_8) + mov v21.16b, v24.16b + mov v22.16b, v25.16b + +88: + smull v2.4s, v16.4h, v1.h[0] + smull2 v3.4s, v16.8h, v1.h[0] + bl L(\type\()_8tap_filter_8) + smull v4.4s, v17.4h, v1.h[0] + smull2 v5.4s, v17.8h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal2 v3.4s, v17.8h, v1.h[1] + smlal v4.4s, v18.4h, v1.h[1] + smlal2 v5.4s, v18.8h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] + smlal2 v3.4s, v18.8h, v1.h[2] + smlal v4.4s, v19.4h, v1.h[2] + smlal2 v5.4s, v19.8h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] + smlal2 v3.4s, v19.8h, v1.h[3] + smlal v4.4s, v20.4h, v1.h[3] + smlal2 v5.4s, v20.8h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] + smlal2 v3.4s, v20.8h, v1.h[4] + smlal v4.4s, v21.4h, v1.h[4] + smlal2 v5.4s, v21.8h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] + smlal2 v3.4s, v21.8h, v1.h[5] + smlal v4.4s, v22.4h, v1.h[5] + smlal2 v5.4s, v22.8h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] + smlal2 v3.4s, v22.8h, v1.h[6] + smlal v4.4s, v24.4h, v1.h[6] + smlal2 v5.4s, v24.8h, v1.h[6] + smlal v2.4s, v24.4h, v1.h[7] + smlal2 v3.4s, v24.8h, v1.h[7] + smlal v4.4s, v25.4h, v1.h[7] + smlal2 v5.4s, v25.8h, v1.h[7] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn2 v2.8h, v3.4s, #\shift_hv + sqrshrn v4.4h, v4.4s, #\shift_hv + sqrshrn2 v4.8h, v5.4s, #\shift_hv + subs \h, \h, #2 +.ifc \type, put + sqxtun v2.8b, v2.8h + sqxtun v4.8b, v4.8h + st1 {v2.8b}, [\dst], \d_strd + st1 {v4.8b}, [\ds2], \d_strd +.else + st1 {v2.8h}, [\dst], \d_strd + st1 {v4.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v20.16b, v22.16b + mov v21.16b, v24.16b + mov v22.16b, v25.16b + b 88b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #3 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 168b +0: + br x15 + +L(\type\()_8tap_filter_8): + ld1 {v28.8b, v29.8b}, [\sr2], \s_strd + ld1 {v30.8b, v31.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b + uxtl v30.8h, v30.8b + uxtl v31.8h, v31.8b + mul v24.8h, v28.8h, v0.h[0] + mul v25.8h, v30.8h, v0.h[0] +.irpc i, 1234567 + ext v26.16b, v28.16b, v29.16b, #(2*\i) + ext v27.16b, v30.16b, v31.16b, #(2*\i) + mla v24.8h, v26.8h, v0.h[\i] + mla v25.8h, v27.8h, v0.h[\i] +.endr + srshr v24.8h, v24.8h, #2 + srshr v25.8h, v25.8h, #2 + ret + +L(\type\()_8tap_hv_tbl): + .hword L(\type\()_8tap_hv_tbl) - 1280b + .hword L(\type\()_8tap_hv_tbl) - 640b + .hword L(\type\()_8tap_hv_tbl) - 320b + .hword L(\type\()_8tap_hv_tbl) - 160b + .hword L(\type\()_8tap_hv_tbl) - 80b + .hword L(\type\()_8tap_hv_tbl) - 40b + .hword L(\type\()_8tap_hv_tbl) - 20b + .hword 0 +endfunc + + +function \type\()_bilin_8bpc_neon, export=1 + dup v1.16b, \mx + dup v3.16b, \my + mov w9, #16 + sub w8, w9, \mx + sub w9, w9, \my + dup v0.16b, w8 + dup v2.16b, w9 +.ifc \type, prep + uxtw \d_strd, \w + lsl \d_strd, \d_strd, #1 +.endif + + clz w8, \w + sub w8, w8, #24 + cbnz \mx, L(\type\()_bilin_h) + cbnz \my, L(\type\()_bilin_v) + b \type + +L(\type\()_bilin_h): + cbnz \my, L(\type\()_bilin_hv) + + adr x9, L(\type\()_bilin_h_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN h +.ifc \type, put + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +2: + ld1 {v4.s}[0], [\src], \s_strd + ld1 {v6.s}[0], [\sr2], \s_strd + ext v5.8b, v4.8b, v4.8b, #1 + ext v7.8b, v6.8b, v6.8b, #1 + trn1 v4.4h, v4.4h, v6.4h + trn1 v5.4h, v5.4h, v7.4h + subs \h, \h, #2 + umull v4.8h, v4.8b, v0.8b + umlal v4.8h, v5.8b, v1.8b + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.h}[0], [\dst], \d_strd + st1 {v4.h}[1], [\ds2], \d_strd + b.gt 2b + ret +.endif + +40: // 4xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +4: + ld1 {v4.8b}, [\src], \s_strd + ld1 {v6.8b}, [\sr2], \s_strd + ext v5.8b, v4.8b, v4.8b, #1 + ext v7.8b, v6.8b, v6.8b, #1 + trn1 v4.2s, v4.2s, v6.2s + trn1 v5.2s, v5.2s, v7.2s + subs \h, \h, #2 + umull v4.8h, v4.8b, v0.8b + umlal v4.8h, v5.8b, v1.8b +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd +.else + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd +.endif + b.gt 4b + ret + +80: // 8xN h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \d_strd, \d_strd, #1 + lsl \s_strd, \s_strd, #1 +8: + ld1 {v4.16b}, [\src], \s_strd + ld1 {v6.16b}, [\sr2], \s_strd + ext v5.16b, v4.16b, v4.16b, #1 + ext v7.16b, v6.16b, v6.16b, #1 + subs \h, \h, #2 + umull v4.8h, v4.8b, v0.8b + umull v6.8h, v6.8b, v0.8b + umlal v4.8h, v5.8b, v1.8b + umlal v6.8h, v7.8b, v1.8b +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + uqrshrn v6.8b, v6.8h, #4 + st1 {v4.8b}, [\dst], \d_strd + st1 {v6.8b}, [\ds2], \d_strd +.else + st1 {v4.8h}, [\dst], \d_strd + st1 {v6.8h}, [\ds2], \d_strd +.endif + b.gt 8b + ret +160: +320: +640: +1280: // 16xN, 32xN, ... h + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + + sub \s_strd, \s_strd, \w, uxtw + sub \s_strd, \s_strd, #8 +.ifc \type, put + lsl \d_strd, \d_strd, #1 + sub \d_strd, \d_strd, \w, uxtw +.endif +161: + ld1 {v16.d}[1], [\src], #8 + ld1 {v20.d}[1], [\sr2], #8 + mov \mx, \w + +16: + ld1 {v18.16b}, [\src], #16 + ld1 {v22.16b}, [\sr2], #16 + ext v17.16b, v16.16b, v18.16b, #8 + ext v19.16b, v16.16b, v18.16b, #9 + ext v21.16b, v20.16b, v22.16b, #8 + ext v23.16b, v20.16b, v22.16b, #9 + umull v16.8h, v17.8b, v0.8b + umull2 v17.8h, v17.16b, v0.16b + umull v20.8h, v21.8b, v0.8b + umull2 v21.8h, v21.16b, v0.16b + umlal v16.8h, v19.8b, v1.8b + umlal2 v17.8h, v19.16b, v1.16b + umlal v20.8h, v23.8b, v1.8b + umlal2 v21.8h, v23.16b, v1.16b + subs \mx, \mx, #16 +.ifc \type, put + uqrshrn v16.8b, v16.8h, #4 + uqrshrn2 v16.16b, v17.8h, #4 + uqrshrn v20.8b, v20.8h, #4 + uqrshrn2 v20.16b, v21.8h, #4 + st1 {v16.16b}, [\dst], #16 + st1 {v20.16b}, [\ds2], #16 +.else + st1 {v16.8h, v17.8h}, [\dst], #32 + st1 {v20.8h, v21.8h}, [\ds2], #32 +.endif + b.le 9f + + mov v16.16b, v18.16b + mov v20.16b, v22.16b + b 16b + +9: + add \dst, \dst, \d_strd + add \ds2, \ds2, \d_strd + add \src, \src, \s_strd + add \sr2, \sr2, \s_strd + + subs \h, \h, #2 + b.gt 161b + ret + +L(\type\()_bilin_h_tbl): + .hword L(\type\()_bilin_h_tbl) - 1280b + .hword L(\type\()_bilin_h_tbl) - 640b + .hword L(\type\()_bilin_h_tbl) - 320b + .hword L(\type\()_bilin_h_tbl) - 160b + .hword L(\type\()_bilin_h_tbl) - 80b + .hword L(\type\()_bilin_h_tbl) - 40b + .hword L(\type\()_bilin_h_tbl) - 20b + .hword 0 + + +L(\type\()_bilin_v): + cmp \h, #4 + adr x9, L(\type\()_bilin_v_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN v +.ifc \type, put + cmp \h, #2 + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + // 2x2 v + ld1 {v16.h}[0], [\src], \s_strd + b.gt 24f + ld1 {v17.h}[0], [\sr2], \s_strd + ld1 {v18.h}[0], [\src], \s_strd + trn1 v16.4h, v16.4h, v17.4h + trn1 v17.4h, v17.4h, v18.4h + umull v4.8h, v16.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.h}[0], [\dst] + st1 {v4.h}[1], [\ds2] + ret +24: // 2x4, 2x8, ... v + ld1 {v17.h}[0], [\sr2], \s_strd + ld1 {v18.h}[0], [\src], \s_strd + ld1 {v19.h}[0], [\sr2], \s_strd + ld1 {v20.h}[0], [\src], \s_strd + trn1 v16.4h, v16.4h, v17.4h + trn1 v17.4h, v17.4h, v18.4h + trn1 v18.4h, v18.4h, v19.4h + trn1 v19.4h, v19.4h, v20.4h + trn1 v16.2s, v16.2s, v18.2s + trn1 v17.2s, v17.2s, v19.2s + umull v4.8h, v16.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + subs \h, \h, #4 + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.h}[0], [\dst], \d_strd + st1 {v4.h}[1], [\ds2], \d_strd + st1 {v4.h}[2], [\dst], \d_strd + st1 {v4.h}[3], [\ds2], \d_strd + b.le 0f + mov v16.8b, v20.8b + b 24b +0: + ret +.endif + +40: // 4xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + ld1 {v16.s}[0], [\src], \s_strd +4: + ld1 {v17.s}[0], [\sr2], \s_strd + ld1 {v18.s}[0], [\src], \s_strd + trn1 v16.2s, v16.2s, v17.2s + trn1 v17.2s, v17.2s, v18.2s + umull v4.8h, v16.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd +.else + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + b 4b +0: + ret + +80: // 8xN v + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + ld1 {v16.8b}, [\src], \s_strd +8: + ld1 {v17.8b}, [\sr2], \s_strd + ld1 {v18.8b}, [\src], \s_strd + umull v4.8h, v16.8b, v2.8b + umull v5.8h, v17.8b, v2.8b + umlal v4.8h, v17.8b, v3.8b + umlal v5.8h, v18.8b, v3.8b + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + uqrshrn v5.8b, v5.8h, #4 + st1 {v4.8b}, [\dst], \d_strd + st1 {v5.8b}, [\ds2], \d_strd +.else + st1 {v4.8h}, [\dst], \d_strd + st1 {v5.8h}, [\ds2], \d_strd +.endif + b.le 0f + mov v16.8b, v18.8b + b 8b +0: + ret + +160: // 16xN, 32xN, ... +320: +640: +1280: + mov \my, \h +1: + add \ds2, \dst, \d_strd + add \sr2, \src, \s_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v16.16b}, [\src], \s_strd +2: + ld1 {v17.16b}, [\sr2], \s_strd + ld1 {v18.16b}, [\src], \s_strd + umull v4.8h, v16.8b, v2.8b + umull2 v5.8h, v16.16b, v2.16b + umull v6.8h, v17.8b, v2.8b + umull2 v7.8h, v17.16b, v2.16b + umlal v4.8h, v17.8b, v3.8b + umlal2 v5.8h, v17.16b, v3.16b + umlal v6.8h, v18.8b, v3.8b + umlal2 v7.8h, v18.16b, v3.16b + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #4 + uqrshrn2 v4.16b, v5.8h, #4 + uqrshrn v6.8b, v6.8h, #4 + uqrshrn2 v6.16b, v7.8h, #4 + st1 {v4.16b}, [\dst], \d_strd + st1 {v6.16b}, [\ds2], \d_strd +.else + st1 {v4.8h, v5.8h}, [\dst], \d_strd + st1 {v6.8h, v7.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + b 2b +9: + subs \w, \w, #16 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #16 +.ifc \type, put + add \dst, \dst, #16 +.else + add \dst, \dst, #32 +.endif + b 1b +0: + ret + +L(\type\()_bilin_v_tbl): + .hword L(\type\()_bilin_v_tbl) - 1280b + .hword L(\type\()_bilin_v_tbl) - 640b + .hword L(\type\()_bilin_v_tbl) - 320b + .hword L(\type\()_bilin_v_tbl) - 160b + .hword L(\type\()_bilin_v_tbl) - 80b + .hword L(\type\()_bilin_v_tbl) - 40b + .hword L(\type\()_bilin_v_tbl) - 20b + .hword 0 + +L(\type\()_bilin_hv): + uxtl v2.8h, v2.8b + uxtl v3.8h, v3.8b + adr x9, L(\type\()_bilin_hv_tbl) + ldrh w8, [x9, x8, lsl #1] + sub x9, x9, w8, uxtw + br x9 + +20: // 2xN hv +.ifc \type, put + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v28.s}[0], [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + umull v16.8h, v28.8b, v0.8b + umlal v16.8h, v29.8b, v1.8b + +2: + ld1 {v28.s}[0], [\sr2], \s_strd + ld1 {v30.s}[0], [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + ext v31.8b, v30.8b, v30.8b, #1 + trn1 v28.4h, v28.4h, v30.4h + trn1 v29.4h, v29.4h, v31.4h + umull v17.8h, v28.8b, v0.8b + umlal v17.8h, v29.8b, v1.8b + + trn1 v16.2s, v16.2s, v17.2s + + mul v4.4h, v16.4h, v2.4h + mla v4.4h, v17.4h, v3.4h + uqrshrn v4.8b, v4.8h, #8 + subs \h, \h, #2 + st1 {v4.h}[0], [\dst], \d_strd + st1 {v4.h}[1], [\ds2], \d_strd + b.le 0f + trn2 v16.2s, v17.2s, v17.2s + b 2b +0: + ret +.endif + +40: // 4xN hv + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v28.8b}, [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + umull v16.8h, v28.8b, v0.8b + umlal v16.8h, v29.8b, v1.8b + +4: + ld1 {v28.8b}, [\sr2], \s_strd + ld1 {v30.8b}, [\src], \s_strd + ext v29.8b, v28.8b, v28.8b, #1 + ext v31.8b, v30.8b, v30.8b, #1 + trn1 v28.2s, v28.2s, v30.2s + trn1 v29.2s, v29.2s, v31.2s + umull v17.8h, v28.8b, v0.8b + umlal v17.8h, v29.8b, v1.8b + + trn1 v16.2d, v16.2d, v17.2d + + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #8 + st1 {v4.s}[0], [\dst], \d_strd + st1 {v4.s}[1], [\ds2], \d_strd +.else + urshr v4.8h, v4.8h, #4 + st1 {v4.d}[0], [\dst], \d_strd + st1 {v4.d}[1], [\ds2], \d_strd +.endif + b.le 0f + trn2 v16.2d, v17.2d, v17.2d + b 4b +0: + ret + +80: // 8xN, 16xN, ... hv +160: +320: +640: +1280: + mov \my, \h + +1: + add \sr2, \src, \s_strd + add \ds2, \dst, \d_strd + lsl \s_strd, \s_strd, #1 + lsl \d_strd, \d_strd, #1 + + ld1 {v28.16b}, [\src], \s_strd + ext v29.16b, v28.16b, v28.16b, #1 + umull v16.8h, v28.8b, v0.8b + umlal v16.8h, v29.8b, v1.8b + +2: + ld1 {v28.16b}, [\sr2], \s_strd + ld1 {v30.16b}, [\src], \s_strd + ext v29.16b, v28.16b, v28.16b, #1 + ext v31.16b, v30.16b, v30.16b, #1 + umull v17.8h, v28.8b, v0.8b + umlal v17.8h, v29.8b, v1.8b + umull v18.8h, v30.8b, v0.8b + umlal v18.8h, v31.8b, v1.8b + + mul v4.8h, v16.8h, v2.8h + mla v4.8h, v17.8h, v3.8h + mul v5.8h, v17.8h, v2.8h + mla v5.8h, v18.8h, v3.8h + subs \h, \h, #2 +.ifc \type, put + uqrshrn v4.8b, v4.8h, #8 + uqrshrn v5.8b, v5.8h, #8 + st1 {v4.8b}, [\dst], \d_strd + st1 {v5.8b}, [\ds2], \d_strd +.else + urshr v4.8h, v4.8h, #4 + urshr v5.8h, v5.8h, #4 + st1 {v4.8h}, [\dst], \d_strd + st1 {v5.8h}, [\ds2], \d_strd +.endif + b.le 9f + mov v16.16b, v18.16b + b 2b +9: + subs \w, \w, #8 + b.le 0f + asr \s_strd, \s_strd, #1 + asr \d_strd, \d_strd, #1 + msub \src, \s_strd, \xmy, \src + msub \dst, \d_strd, \xmy, \dst + sub \src, \src, \s_strd, lsl #1 + mov \h, \my + add \src, \src, #8 +.ifc \type, put + add \dst, \dst, #8 +.else + add \dst, \dst, #16 +.endif + b 1b +0: + ret + +L(\type\()_bilin_hv_tbl): + .hword L(\type\()_bilin_hv_tbl) - 1280b + .hword L(\type\()_bilin_hv_tbl) - 640b + .hword L(\type\()_bilin_hv_tbl) - 320b + .hword L(\type\()_bilin_hv_tbl) - 160b + .hword L(\type\()_bilin_hv_tbl) - 80b + .hword L(\type\()_bilin_hv_tbl) - 40b + .hword L(\type\()_bilin_hv_tbl) - 20b + .hword 0 +endfunc +.endm + +filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 +filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 diff --git a/src/arm/asm.S b/src/arm/asm.S index 0b2291b..e5722cf 100644 --- a/src/arm/asm.S +++ b/src/arm/asm.S @@ -129,4 +129,6 @@ EXTERN\name: #define L(x) .L ## x #endif +#define X(x) CONCAT(EXTERN, x) + #endif /* __DAV1D_SRC_ARM_ASM_S__ */ diff --git a/src/arm/mc_init_tmpl.c b/src/arm/mc_init_tmpl.c index cf3a5fd..0e34854 100644 --- a/src/arm/mc_init_tmpl.c +++ b/src/arm/mc_init_tmpl.c @@ -30,16 +30,66 @@ #include "src/mc.h" #include "src/cpu.h" +decl_mc_fn(dav1d_put_8tap_regular_8bpc_neon); +decl_mc_fn(dav1d_put_8tap_regular_smooth_8bpc_neon); +decl_mc_fn(dav1d_put_8tap_regular_sharp_8bpc_neon); +decl_mc_fn(dav1d_put_8tap_smooth_8bpc_neon); +decl_mc_fn(dav1d_put_8tap_smooth_regular_8bpc_neon); +decl_mc_fn(dav1d_put_8tap_smooth_sharp_8bpc_neon); +decl_mc_fn(dav1d_put_8tap_sharp_8bpc_neon); +decl_mc_fn(dav1d_put_8tap_sharp_regular_8bpc_neon); +decl_mc_fn(dav1d_put_8tap_sharp_smooth_8bpc_neon); +decl_mc_fn(dav1d_put_bilin_8bpc_neon); + +decl_mct_fn(dav1d_prep_8tap_regular_8bpc_neon); +decl_mct_fn(dav1d_prep_8tap_regular_smooth_8bpc_neon); +decl_mct_fn(dav1d_prep_8tap_regular_sharp_8bpc_neon); +decl_mct_fn(dav1d_prep_8tap_smooth_8bpc_neon); +decl_mct_fn(dav1d_prep_8tap_smooth_regular_8bpc_neon); +decl_mct_fn(dav1d_prep_8tap_smooth_sharp_8bpc_neon); +decl_mct_fn(dav1d_prep_8tap_sharp_8bpc_neon); +decl_mct_fn(dav1d_prep_8tap_sharp_regular_8bpc_neon); +decl_mct_fn(dav1d_prep_8tap_sharp_smooth_8bpc_neon); +decl_mct_fn(dav1d_prep_bilin_8bpc_neon); + decl_avg_fn(dav1d_avg_8bpc_neon); decl_w_avg_fn(dav1d_w_avg_8bpc_neon); decl_mask_fn(dav1d_mask_8bpc_neon); void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) { +#define init_mc_fn(type, name, suffix) \ + c->mc[type] = dav1d_put_##name##_8bpc_##suffix +#define init_mct_fn(type, name, suffix) \ + c->mct[type] = dav1d_prep_##name##_8bpc_##suffix const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; #if BITDEPTH == 8 +#if ARCH_AARCH64 + init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); + init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); + init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); + init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); + init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); + init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); + init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); +#endif + c->avg = dav1d_avg_8bpc_neon; c->w_avg = dav1d_w_avg_8bpc_neon; c->mask = dav1d_mask_8bpc_neon; |