diff options
author | Martin Storsjö <martin@martin.st> | 2020-02-18 01:10:21 +0300 |
---|---|---|
committer | Jean-Baptiste Kempf <jb@videolan.org> | 2020-03-02 12:58:11 +0300 |
commit | 360243c2cafb9c3aef10f129c0c1373ffd708ede (patch) | |
tree | 94744daf5922be7349da1a887a229ee59a718483 /src/arm | |
parent | ebbf91f44422c7a2778a9e524a9cc7a1b5c66dcf (diff) |
arm64: loopfilter: NEON implementation of loopfilter for 16 bpc
Checkasm runtimes: Cortex A53 A72 A73
lpf_h_sb_uv_w4_16bpc_neon: 919.0 795.0 714.9
lpf_h_sb_uv_w6_16bpc_neon: 1267.7 1116.2 1081.9
lpf_h_sb_y_w4_16bpc_neon: 1500.2 1543.9 1778.5
lpf_h_sb_y_w8_16bpc_neon: 2216.1 2183.0 2568.1
lpf_h_sb_y_w16_16bpc_neon: 2641.8 2630.4 2639.4
lpf_v_sb_uv_w4_16bpc_neon: 836.5 572.7 667.3
lpf_v_sb_uv_w6_16bpc_neon: 1130.8 709.1 955.5
lpf_v_sb_y_w4_16bpc_neon: 1271.6 1434.4 1272.1
lpf_v_sb_y_w8_16bpc_neon: 1818.0 1759.1 1664.6
lpf_v_sb_y_w16_16bpc_neon: 1998.6 2115.8 1586.6
Corresponding numbers for 8 bpc for comparison:
lpf_h_sb_uv_w4_8bpc_neon: 799.4 632.8 695.4
lpf_h_sb_uv_w6_8bpc_neon: 1067.3 613.6 767.5
lpf_h_sb_y_w4_8bpc_neon: 1490.5 1179.1 1018.9
lpf_h_sb_y_w8_8bpc_neon: 1892.9 1382.0 1172.0
lpf_h_sb_y_w16_8bpc_neon: 2117.4 1625.4 1739.0
lpf_v_sb_uv_w4_8bpc_neon: 447.1 447.7 446.0
lpf_v_sb_uv_w6_8bpc_neon: 522.1 529.0 513.1
lpf_v_sb_y_w4_8bpc_neon: 1043.7 785.0 775.9
lpf_v_sb_y_w8_8bpc_neon: 1500.4 1115.9 881.2
lpf_v_sb_y_w16_8bpc_neon: 1493.5 1371.4 1248.5
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/64/loopfilter16.S | 907 | ||||
-rw-r--r-- | src/arm/loopfilter_init_tmpl.c | 2 |
2 files changed, 908 insertions, 1 deletions
diff --git a/src/arm/64/loopfilter16.S b/src/arm/64/loopfilter16.S new file mode 100644 index 0000000..a731918 --- /dev/null +++ b/src/arm/64/loopfilter16.S @@ -0,0 +1,907 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro loop_filter wd +function lpf_8_wd\wd\()_neon + uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0) + uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0) + uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0) + uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1) +.if \wd >= 6 + uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1) + uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1) +.endif +.if \wd >= 8 + uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2) + uabd v7.8h, v27.8h, v26.8h // abs(q3 - q3) +.endif +.if \wd >= 6 + umax v4.8h, v4.8h, v5.8h +.endif + uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2 +.if \wd >= 8 + umax v6.8h, v6.8h, v7.8h +.endif + ushr v3.8h, v3.8h, #1 +.if \wd >= 8 + umax v4.8h, v4.8h, v6.8h +.endif +.if \wd >= 6 + and v4.16b, v4.16b, v14.16b +.endif + umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0)) + uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +.if \wd >= 6 + umax v4.8h, v0.8h, v4.8h + cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I +.else + cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I +.endif + cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E + and v1.16b, v1.16b, v2.16b // fm + and v1.16b, v1.16b, v13.16b // fm && wd >= 4 +.if \wd >= 6 + and v14.16b, v14.16b, v1.16b // fm && wd > 4 +.endif +.if \wd >= 16 + and v15.16b, v15.16b, v1.16b // fm && wd == 16 +.endif + + mov x16, v1.d[0] + mov x17, v1.d[1] + adds x16, x16, x17 + b.eq 9f // if (!fm || wd < 4) return; + +.if \wd >= 6 + movi v10.8h, #1 + uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) + uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0) + uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0) + uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0) + dup v9.8h, w9 // bitdepth_min_8 +.if \wd >= 8 + uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0) + uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0) +.endif + umax v2.8h, v2.8h, v3.8h + umax v4.8h, v4.8h, v5.8h +.if \wd >= 8 + umax v6.8h, v6.8h, v7.8h +.endif + umax v2.8h, v2.8h, v4.8h + ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8 +.if \wd >= 8 + umax v2.8h, v2.8h, v6.8h +.endif + +.if \wd == 16 + uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0) + uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0) + uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0) +.endif + cmhs v2.8h, v10.8h, v2.8h // flat8in +.if \wd == 16 + uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0) + uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0) + uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0) +.endif + and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4 + bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in +.if \wd == 16 + umax v3.8h, v3.8h, v4.8h + umax v5.8h, v5.8h, v6.8h +.endif + mov x16, v1.d[0] + mov x17, v1.d[1] +.if \wd == 16 + umax v7.8h, v7.8h, v8.8h + umax v3.8h, v3.8h, v5.8h + umax v3.8h, v3.8h, v7.8h + cmhs v3.8h, v10.8h, v3.8h // flat8out +.endif + adds x16, x16, x17 +.if \wd == 16 + and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16 + and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16 + bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out +.endif + b.eq 1f // skip wd == 4 case +.endif + + dup v3.8h, w8 // bitdepth_max + sub v2.8h, v22.8h, v25.8h // p1 - q1 + ushr v3.8h, v3.8h, #1 // 128 << bitdepth_min_8 - 1 + cmhi v0.8h, v0.8h, v12.8h // hev + not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8) + smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1) + smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1) + and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) + sub v2.8h, v24.8h, v23.8h + movi v5.8h, #3 + bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) + mul v2.8h, v2.8h, v5.8h + movi v6.8h, #4 + add v2.8h, v2.8h, v4.8h + smin v2.8h, v2.8h, v3.8h // f = iclip_diff() + movi v7.8h, #3 + smax v2.8h, v2.8h, v9.8h // f = iclip_diff() + sqadd v4.8h, v6.8h, v2.8h // f + 4 + sqadd v5.8h, v7.8h, v2.8h // f + 3 + smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1) + smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1) + sshr v4.8h, v4.8h, #3 // f1 + sshr v5.8h, v5.8h, #3 // f2 + movi v9.8h, #0 + dup v3.8h, w8 // bitdepth_max + sqadd v2.8h, v23.8h, v5.8h // p0 + f2 + sqsub v6.8h, v24.8h, v4.8h // q0 - f1 + srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1 + smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel() + smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel() + smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel() + smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel() + bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) + bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4) + sqadd v2.8h, v22.8h, v4.8h // p1 + f + sqsub v6.8h, v25.8h, v4.8h // q1 - f + smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel() + smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel() + smax v2.8h, v2.8h, v9.8h // out p1 = iclip_pixel() + smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel() + bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) + bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev) +1: + +.if \wd == 6 + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 + b.eq 2f // skip if there's no flat8in + + add v0.8h, v21.8h, v21.8h // p2 * 2 + add v2.8h, v21.8h, v22.8h // p2 + p1 + add v4.8h, v22.8h, v23.8h // p1 + p0 + add v6.8h, v23.8h, v24.8h // p0 + q0 + add v8.8h, v0.8h, v2.8h + add v10.8h, v4.8h, v6.8h + add v12.8h, v24.8h, v25.8h // q0 + q1 + add v8.8h, v8.8h, v10.8h + sub v12.8h, v12.8h, v0.8h + add v10.8h, v25.8h, v26.8h // q1 + q2 + urshr v0.8h, v8.8h, #3 // out p1 + + add v8.8h, v8.8h, v12.8h + sub v10.8h, v10.8h, v2.8h + add v12.8h, v26.8h, v26.8h // q2 + q2 + urshr v1.8h, v8.8h, #3 // out p0 + + add v8.8h, v8.8h, v10.8h + sub v12.8h, v12.8h, v4.8h + urshr v2.8h, v8.8h, #3 // out q0 + + bit v22.16b, v0.16b, v14.16b // p1 if (flat8in) + add v8.8h, v8.8h, v12.8h + bit v23.16b, v1.16b, v14.16b // p0 if (flat8in) + urshr v3.8h, v8.8h, #3 // out q1 + bit v24.16b, v2.16b, v14.16b // q0 if (flat8in) + bit v25.16b, v3.16b, v14.16b // q1 if (flat8in) +.elseif \wd >= 8 + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 +.if \wd == 8 + b.eq 8f // skip if there's no flat8in +.else + b.eq 2f // skip if there's no flat8in +.endif + + add v0.8h, v20.8h, v21.8h // p3 + p2 + add v2.8h, v22.8h, v25.8h // p1 + q1 + add v4.8h, v20.8h, v22.8h // p3 + p1 + add v6.8h, v23.8h, v26.8h // p0 + q2 + add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2) + add v9.8h, v23.8h, v24.8h // p0 + q0 + add v8.8h, v8.8h, v4.8h // + p3 + p1 + sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2 + add v8.8h, v8.8h, v9.8h // + p0 + q0 + sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1 + urshr v10.8h, v8.8h, #3 // out p2 + + add v8.8h, v8.8h, v2.8h + add v0.8h, v20.8h, v23.8h // p3 + p0 + add v2.8h, v24.8h, v27.8h // q0 + q3 + urshr v11.8h, v8.8h, #3 // out p1 + + add v8.8h, v8.8h, v6.8h + sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0 + add v4.8h, v21.8h, v24.8h // p2 + q0 + add v6.8h, v25.8h, v27.8h // q1 + q3 + urshr v12.8h, v8.8h, #3 // out p0 + + add v8.8h, v8.8h, v2.8h + sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0 + add v0.8h, v22.8h, v25.8h // p1 + q1 + add v2.8h, v26.8h, v27.8h // q2 + q3 + urshr v13.8h, v8.8h, #3 // out q0 + + add v8.8h, v8.8h, v6.8h + sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1 + urshr v0.8h, v8.8h, #3 // out q1 + + add v8.8h, v8.8h, v2.8h + + bit v21.16b, v10.16b, v14.16b + bit v22.16b, v11.16b, v14.16b + bit v23.16b, v12.16b, v14.16b + urshr v1.8h, v8.8h, #3 // out q2 + bit v24.16b, v13.16b, v14.16b + bit v25.16b, v0.16b, v14.16b + bit v26.16b, v1.16b, v14.16b +.endif +2: +.if \wd == 16 + mov x16, v15.d[0] + mov x17, v15.d[1] + adds x16, x16, x17 + b.ne 1f // check if flat8out is needed + mov x16, v14.d[0] + mov x17, v14.d[1] + adds x16, x16, x17 + b.eq 8f // if there was no flat8in, just write the inner 4 pixels + b 7f // if flat8in was used, write the inner 6 pixels +1: + + add v2.8h, v17.8h, v17.8h // p6 + p6 + add v4.8h, v17.8h, v18.8h // p6 + p5 + add v6.8h, v17.8h, v19.8h // p6 + p4 + add v8.8h, v17.8h, v20.8h // p6 + p3 + add v12.8h, v2.8h, v4.8h + add v10.8h, v6.8h, v8.8h + add v6.8h, v17.8h, v21.8h // p6 + p2 + add v12.8h, v12.8h, v10.8h + add v8.8h, v17.8h, v22.8h // p6 + p1 + add v10.8h, v18.8h, v23.8h // p5 + p0 + add v6.8h, v6.8h, v8.8h + add v8.8h, v19.8h, v24.8h // p4 + q0 + add v12.8h, v12.8h, v6.8h + add v10.8h, v10.8h, v8.8h + add v6.8h, v20.8h, v25.8h // p3 + q1 + add v12.8h, v12.8h, v10.8h + sub v6.8h, v6.8h, v2.8h + add v2.8h, v21.8h, v26.8h // p2 + q2 + urshr v0.8h, v12.8h, #4 // out p5 + add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1) + sub v2.8h, v2.8h, v4.8h + add v4.8h, v22.8h, v27.8h // p1 + q3 + add v6.8h, v17.8h, v19.8h // p6 + p4 + urshr v1.8h, v12.8h, #4 // out p4 + add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2) + sub v4.8h, v4.8h, v6.8h + add v6.8h, v23.8h, v28.8h // p0 + q4 + add v8.8h, v17.8h, v20.8h // p6 + p3 + urshr v2.8h, v12.8h, #4 // out p3 + add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3) + sub v6.8h, v6.8h, v8.8h + add v8.8h, v24.8h, v29.8h // q0 + q5 + add v4.8h, v17.8h, v21.8h // p6 + p2 + urshr v3.8h, v12.8h, #4 // out p2 + add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4) + sub v8.8h, v8.8h, v4.8h + add v6.8h, v25.8h, v30.8h // q1 + q6 + add v10.8h, v17.8h, v22.8h // p6 + p1 + urshr v4.8h, v12.8h, #4 // out p1 + add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5) + sub v6.8h, v6.8h, v10.8h + add v8.8h, v26.8h, v30.8h // q2 + q6 + bif v0.16b, v18.16b, v15.16b // out p5 + add v10.8h, v18.8h, v23.8h // p5 + p0 + urshr v5.8h, v12.8h, #4 // out p0 + add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6) + sub v8.8h, v8.8h, v10.8h + add v10.8h, v27.8h, v30.8h // q3 + q6 + bif v1.16b, v19.16b, v15.16b // out p4 + add v18.8h, v19.8h, v24.8h // p4 + q0 + urshr v6.8h, v12.8h, #4 // out q0 + add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6) + sub v10.8h, v10.8h, v18.8h + add v8.8h, v28.8h, v30.8h // q4 + q6 + bif v2.16b, v20.16b, v15.16b // out p3 + add v18.8h, v20.8h, v25.8h // p3 + q1 + urshr v7.8h, v12.8h, #4 // out q1 + add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6) + sub v18.8h, v8.8h, v18.8h + add v10.8h, v29.8h, v30.8h // q5 + q6 + bif v3.16b, v21.16b, v15.16b // out p2 + add v20.8h, v21.8h, v26.8h // p2 + q2 + urshr v8.8h, v12.8h, #4 // out q2 + add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6) + sub v10.8h, v10.8h, v20.8h + add v18.8h, v30.8h, v30.8h // q6 + q6 + bif v4.16b, v22.16b, v15.16b // out p1 + add v20.8h, v22.8h, v27.8h // p1 + q3 + urshr v9.8h, v12.8h, #4 // out q3 + add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6) + sub v18.8h, v18.8h, v20.8h + bif v5.16b, v23.16b, v15.16b // out p0 + urshr v10.8h, v12.8h, #4 // out q4 + add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6) + urshr v11.8h, v12.8h, #4 // out q5 + bif v6.16b, v24.16b, v15.16b // out q0 + bif v7.16b, v25.16b, v15.16b // out q1 + bif v8.16b, v26.16b, v15.16b // out q2 + bif v9.16b, v27.16b, v15.16b // out q3 + bif v10.16b, v28.16b, v15.16b // out q4 + bif v11.16b, v29.16b, v15.16b // out q5 +.endif + + ret +.if \wd == 16 +7: + // Return to a shorter epilogue, writing only the inner 6 pixels + br x13 +.endif +.if \wd >= 8 +8: + // Return to a shorter epilogue, writing only the inner 4 pixels + br x14 +.endif +9: + // Return directly without writing back any pixels + br x15 +endfunc +.endm + +loop_filter 16 +loop_filter 8 +loop_filter 6 +loop_filter 4 + +.macro lpf_8_wd16 + adr x13, 7f + adr x14, 8f + bl lpf_8_wd16_neon +.endm + +.macro lpf_8_wd8 + adr x14, 8f + bl lpf_8_wd8_neon +.endm + +.macro lpf_8_wd6 + bl lpf_8_wd6_neon +.endm + +.macro lpf_8_wd4 + bl lpf_8_wd4_neon +.endm + +function lpf_v_4_8_neon + mov x15, x30 + sub x16, x0, x1, lsl #1 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + + lpf_8_wd4 + + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_4_8_neon + mov x15, x30 + sub x16, x0, #4 + add x0, x16, x1, lsl #2 + ld1 {v22.d}[0], [x16], x1 + ld1 {v22.d}[1], [x0], x1 + ld1 {v23.d}[0], [x16], x1 + ld1 {v23.d}[1], [x0], x1 + ld1 {v24.d}[0], [x16], x1 + ld1 {v24.d}[1], [x0], x1 + ld1 {v25.d}[0], [x16], x1 + ld1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_8_wd4 + + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + br x15 +endfunc + +function lpf_v_6_8_neon + mov x15, x30 + sub x16, x0, x1, lsl #1 + sub x16, x16, x1 + ld1 {v21.8h}, [x16], x1 // p2 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v26.8h}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + + lpf_8_wd6 + + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_6_8_neon + mov x15, x30 + sub x16, x0, #8 + add x0, x16, x1, lsl #2 + ld1 {v20.8h}, [x16], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v21.8h}, [x16], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v22.8h}, [x16], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v23.8h}, [x16], x1 + ld1 {v27.8h}, [x0], x1 + add x0, x0, #8 + + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_8_wd6 + + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + br x15 +endfunc + +function lpf_v_8_8_neon + mov x15, x30 + sub x16, x0, x1, lsl #2 + ld1 {v20.8h}, [x16], x1 // p3 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v21.8h}, [x16], x1 // p2 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v27.8h}, [x0], x1 // q3 + sub x0, x0, x1, lsl #2 + + lpf_8_wd8 + + sub x16, x0, x1, lsl #1 + sub x16, x16, x1 + st1 {v21.8h}, [x16], x1 // p2 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v25.8h}, [x0], x1 // q1 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v26.8h}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + br x15 + +8: + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_8_8_neon + mov x15, x30 + sub x16, x0, #8 + add x0, x16, x1, lsl #2 + ld1 {v20.8h}, [x16], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v21.8h}, [x16], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v22.8h}, [x16], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v23.8h}, [x16], x1 + ld1 {v27.8h}, [x0], x1 + add x0, x0, #8 + + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + lpf_8_wd8 + + sub x16, x0, x1, lsl #3 + sub x16, x16, #8 + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v20.8h}, [x16], x1 + st1 {v24.8h}, [x0], x1 + st1 {v21.8h}, [x16], x1 + st1 {v25.8h}, [x0], x1 + st1 {v22.8h}, [x16], x1 + st1 {v26.8h}, [x0], x1 + st1 {v23.8h}, [x16], x1 + st1 {v27.8h}, [x0], x1 + add x0, x0, #8 + br x15 +8: + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + br x15 +endfunc + +function lpf_v_16_8_neon + mov x15, x30 + + sub x16, x0, x1, lsl #3 + add x16, x16, x1 + ld1 {v17.8h}, [x16], x1 // p6 + ld1 {v24.8h}, [x0], x1 // q0 + ld1 {v18.8h}, [x16], x1 // p5 + ld1 {v25.8h}, [x0], x1 // q1 + ld1 {v19.8h}, [x16], x1 // p4 + ld1 {v26.8h}, [x0], x1 // q2 + ld1 {v20.8h}, [x16], x1 // p3 + ld1 {v27.8h}, [x0], x1 // q3 + ld1 {v21.8h}, [x16], x1 // p2 + ld1 {v28.8h}, [x0], x1 // q4 + ld1 {v22.8h}, [x16], x1 // p1 + ld1 {v29.8h}, [x0], x1 // q5 + ld1 {v23.8h}, [x16], x1 // p0 + ld1 {v30.8h}, [x0], x1 // q6 + sub x0, x0, x1, lsl #3 + add x0, x0, x1 + + lpf_8_wd16 + + sub x16, x0, x1, lsl #2 + sub x16, x16, x1, lsl #1 + st1 {v0.8h}, [x16], x1 // p5 + st1 {v6.8h}, [x0], x1 // q0 + st1 {v1.8h}, [x16], x1 // p4 + st1 {v7.8h}, [x0], x1 // q1 + st1 {v2.8h}, [x16], x1 // p3 + st1 {v8.8h}, [x0], x1 // q2 + st1 {v3.8h}, [x16], x1 // p2 + st1 {v9.8h}, [x0], x1 // q3 + st1 {v4.8h}, [x16], x1 // p1 + st1 {v10.8h}, [x0], x1 // q4 + st1 {v5.8h}, [x16], x1 // p0 + st1 {v11.8h}, [x0], x1 // q5 + sub x0, x0, x1, lsl #2 + sub x0, x0, x1, lsl #1 + br x15 +7: + sub x16, x0, x1 + sub x16, x16, x1, lsl #1 + st1 {v21.8h}, [x16], x1 // p2 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v25.8h}, [x0], x1 // q1 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v26.8h}, [x0], x1 // q2 + sub x0, x0, x1, lsl #1 + sub x0, x0, x1 + br x15 + +8: + sub x16, x0, x1, lsl #1 + st1 {v22.8h}, [x16], x1 // p1 + st1 {v24.8h}, [x0], x1 // q0 + st1 {v23.8h}, [x16], x1 // p0 + st1 {v25.8h}, [x0], x1 // q1 + sub x0, x0, x1, lsl #1 + br x15 +endfunc + +function lpf_h_16_8_neon + mov x15, x30 + sub x16, x0, #16 + ld1 {v16.8h}, [x16], x1 + ld1 {v24.8h}, [x0], x1 + ld1 {v17.8h}, [x16], x1 + ld1 {v25.8h}, [x0], x1 + ld1 {v18.8h}, [x16], x1 + ld1 {v26.8h}, [x0], x1 + ld1 {v19.8h}, [x16], x1 + ld1 {v27.8h}, [x0], x1 + ld1 {v20.8h}, [x16], x1 + ld1 {v28.8h}, [x0], x1 + ld1 {v21.8h}, [x16], x1 + ld1 {v29.8h}, [x0], x1 + ld1 {v22.8h}, [x16], x1 + ld1 {v30.8h}, [x0], x1 + ld1 {v23.8h}, [x16], x1 + ld1 {v31.8h}, [x0], x1 + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 + + lpf_8_wd16 + + sub x0, x0, x1, lsl #3 + sub x16, x0, #16 + + transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 + transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 + + st1 {v16.8h}, [x16], x1 + st1 {v6.8h}, [x0], x1 + st1 {v17.8h}, [x16], x1 + st1 {v7.8h}, [x0], x1 + st1 {v0.8h}, [x16], x1 + st1 {v8.8h}, [x0], x1 + st1 {v1.8h}, [x16], x1 + st1 {v9.8h}, [x0], x1 + st1 {v2.8h}, [x16], x1 + st1 {v10.8h}, [x0], x1 + st1 {v3.8h}, [x16], x1 + st1 {v11.8h}, [x0], x1 + st1 {v4.8h}, [x16], x1 + st1 {v30.8h}, [x0], x1 + st1 {v5.8h}, [x16], x1 + st1 {v31.8h}, [x0], x1 + br x15 + +7: + sub x16, x0, x1, lsl #3 + sub x16, x16, #8 + transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v20.8h}, [x16], x1 + st1 {v24.8h}, [x0], x1 + st1 {v21.8h}, [x16], x1 + st1 {v25.8h}, [x0], x1 + st1 {v22.8h}, [x16], x1 + st1 {v26.8h}, [x0], x1 + st1 {v23.8h}, [x16], x1 + st1 {v27.8h}, [x0], x1 + add x0, x0, #8 + br x15 +8: + sub x16, x0, x1, lsl #3 + sub x16, x16, #4 + transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 + add x0, x16, x1, lsl #2 + + st1 {v22.d}[0], [x16], x1 + st1 {v22.d}[1], [x0], x1 + st1 {v23.d}[0], [x16], x1 + st1 {v23.d}[1], [x0], x1 + st1 {v24.d}[0], [x16], x1 + st1 {v24.d}[1], [x0], x1 + st1 {v25.d}[0], [x16], x1 + st1 {v25.d}[1], [x0], x1 + add x0, x0, #4 + br x15 +endfunc + +// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint32_t *const vmask, +// const uint8_t (*l)[4], ptrdiff_t b4_stride, +// const Av1FilterLUT *lut, const int w, +// const int bitdepth_max) + +.macro lpf_func dir, type +function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 + mov x11, x30 + mov w8, w7 // bitdepth_max + clz w9, w8 + mov w10, #24 + sub w9, w10, w9 // bitdepth_min_8 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + ldp w6, w7, [x2] // vmask[0], vmask[1] +.ifc \type, y + ldr w2, [x2, #8] // vmask[2] +.endif + add x5, x5, #128 // Move to sharp part of lut +.ifc \type, y + orr w7, w7, w2 // vmask[1] |= vmask[2] +.endif +.ifc \dir, v + sub x4, x3, x4, lsl #2 +.else + sub x3, x3, #4 + lsl x4, x4, #2 +.endif + orr w6, w6, w7 // vmask[0] |= vmask[1] + +1: + tst w6, #0x0f +.ifc \dir, v + ld1 {v0.8b}, [x4], #8 + ld1 {v1.8b}, [x3], #8 +.else + ld2 {v0.s,v1.s}[0], [x3], x4 + ld2 {v0.s,v1.s}[1], [x3], x4 +.endif + b.eq 7f // if (!(vm & bits)) continue; + + ld1r {v5.8b}, [x5] // sharp[0] + add x5, x5, #8 + movi v2.2s, #0xff + dup v13.2s, w6 // vmask[0] + dup v31.8h, w9 // bitdepth_min_8 + + and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word + and v1.8b, v1.8b, v2.8b + cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0] + movi v4.8b, #1 + ld1r {v6.8b}, [x5] // sharp[1] + sub x5, x5, #8 + bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0] + mul v1.2s, v1.2s, v4.2s // L +.ifc \type, y + dup v15.2s, w2 // vmask[2] +.endif + cmtst v2.2s, v1.2s, v2.2s // L != 0 + dup v14.2s, w7 // vmask[1] + mov x16, v2.d[0] + cmp x16, #0 + b.eq 7f // if (!L) continue; + neg v5.8b, v5.8b // -sharp[0] + movrel x16, word_12 + ushr v12.8b, v1.8b, #4 // H + ld1 {v16.2s}, [x16] + sshl v3.8b, v1.8b, v5.8b // L >> sharp[0] +.ifc \type, y + cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits) +.endif + movi v7.8b, #2 + umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1]) + add v0.8b, v1.8b, v7.8b // L + 2 + umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I + add v0.8b, v0.8b, v0.8b // 2*(L + 2) + cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits) + uxtl v12.8h, v12.8b + add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E + cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits) + uxtl v11.8h, v11.8b + uxtl v10.8h, v10.8b + and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0 + sxtl v14.8h, v14.8b + sxtl v13.8h, v13.8b +.ifc \type, y + sxtl v15.8h, v15.8b +.endif + ushl v12.8h, v12.8h, v31.8h + ushl v11.8h, v11.8h, v31.8h + ushl v10.8h, v10.8h, v31.8h + +.ifc \type, y + tst w2, #0x0f + b.eq 2f + // wd16 + bl lpf_\dir\()_16_8_neon + b 8f +2: +.endif + tst w7, #0x0f + b.eq 3f +.ifc \type, y + // wd8 + bl lpf_\dir\()_8_8_neon +.else + // wd6 + bl lpf_\dir\()_6_8_neon +.endif + b 8f +3: + // wd4 + bl lpf_\dir\()_4_8_neon +.ifc \dir, h + b 8f +7: + // For dir h, the functions above increment x0. + // If the whole function is skipped, increment it here instead. + add x0, x0, x1, lsl #3 +.else +7: +.endif +8: + lsr w6, w6, #2 // vmask[0] >>= 2 + lsr w7, w7, #2 // vmask[1] >>= 2 +.ifc \type, y + lsr w2, w2, #2 // vmask[2] >>= 2 +.endif +.ifc \dir, v + add x0, x0, #16 +.else + // For dir h, x0 is returned incremented +.endif + cbnz w6, 1b + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + br x11 +endfunc +.endm + +lpf_func v, y +lpf_func h, y +lpf_func v, uv +lpf_func h, uv + +const word_12 + .word 1, 2 +endconst diff --git a/src/arm/loopfilter_init_tmpl.c b/src/arm/loopfilter_init_tmpl.c index ae76c09..d44f8e1 100644 --- a/src/arm/loopfilter_init_tmpl.c +++ b/src/arm/loopfilter_init_tmpl.c @@ -38,7 +38,7 @@ COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 +#if BITDEPTH == 8 || ARCH_AARCH64 c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, neon); c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, neon); c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, neon); |