github.com/videolan/dav1d.git

author     Martin Storsjö <martin@martin.st>          2020-01-31 23:35:08 +0300
committer  Martin Storsjö <martin@martin.st>          2020-02-04 09:33:26 +0300
commit     03511f8cc77eba680e3db6f5497691a685aa64ce (patch)
tree       93f0fd4457d01a4f6dffb9bf3748e5cb09084c70 /src/arm/64
parent     a285204a5df0f5ec32aab326fd188c942160b356 (diff)

arm64: mc: NEON implementation of avg/mask/w_avg for 16 bpc
avg_w4_16bpc_neon:         78.2    43.2    48.9
avg_w8_16bpc_neon:        199.1   108.7   123.1
avg_w16_16bpc_neon:       615.6   339.9   373.9
avg_w32_16bpc_neon:      2313.0  1390.6  1490.6
avg_w64_16bpc_neon:      5783.6  3119.5  3653.0
avg_w128_16bpc_neon:    15444.6  8168.7  8907.9
w_avg_w4_16bpc_neon:      120.1    87.8    92.4
w_avg_w8_16bpc_neon:      321.6   252.4   263.1
w_avg_w16_16bpc_neon:    1017.5   794.5   831.2
w_avg_w32_16bpc_neon:    3911.4  3154.7  3306.5
w_avg_w64_16bpc_neon:    9977.9  7794.9  8022.3
w_avg_w128_16bpc_neon:  25721.5 19274.6 20041.7
mask_w4_16bpc_neon:       139.5    96.5   104.3
mask_w8_16bpc_neon:       376.0   283.9   300.1
mask_w16_16bpc_neon:     1217.2   906.7   950.0
mask_w32_16bpc_neon:     4811.1  3669.0  3901.3
mask_w64_16bpc_neon:    12036.4  8918.4  9244.8
mask_w128_16bpc_neon:   30888.8 21999.0 23206.7

For comparison, these are the corresponding numbers for 8bpc:

avg_w4_8bpc_neon:          56.7    26.2    28.5
avg_w8_8bpc_neon:         137.2    52.8    64.3
avg_w16_8bpc_neon:        377.9   151.5   161.6
avg_w32_8bpc_neon:       1528.9   614.5   633.9
avg_w64_8bpc_neon:       3792.5  1814.3  1518.3
avg_w128_8bpc_neon:     10685.3  5220.4  3879.9
w_avg_w4_8bpc_neon:        75.2    53.0    41.1
w_avg_w8_8bpc_neon:       186.7   120.1   105.2
w_avg_w16_8bpc_neon:      531.6   314.1   302.1
w_avg_w32_8bpc_neon:     2138.4  1120.4  1171.5
w_avg_w64_8bpc_neon:     5151.9  2910.5  2857.1
w_avg_w128_8bpc_neon:   13945.0  7330.5  7389.1
mask_w4_8bpc_neon:         82.0    47.2    45.1
mask_w8_8bpc_neon:        213.5   115.4   115.8
mask_w16_8bpc_neon:       639.8   356.2   370.1
mask_w32_8bpc_neon:      2566.9  1489.8  1435.0
mask_w64_8bpc_neon:      6727.6  3822.8  3425.2
mask_w128_8bpc_neon:    17893.0  9622.6  9161.3
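
As a rough orientation for reading the assembly below, the scalar operations being
vectorized look roughly like the following C sketch. This is an illustrative
approximation written for this note, not dav1d's actual reference code; the function
names (avg_c_sketch etc.), the flat n-element loop and the use of __builtin_clz are
assumptions, but the bias and rounding constants follow the comments in the assembly.

    #include <stdint.h>

    #define PREP_BIAS 8192   /* intermediate values are stored offset by -PREP_BIAS */

    static inline int iclip(const int v, const int lo, const int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* tmp1/tmp2 hold intermediate (prep) values: pixels scaled up by
     * intermediate_bits and offset by -PREP_BIAS.
     * intermediate_bits = clz(bitdepth_max) - 18, i.e. 4 for 10 bpc, 2 for 12 bpc. */
    static void avg_c_sketch(uint16_t *dst, const int16_t *tmp1, const int16_t *tmp2,
                             const int n, const int bitdepth_max)
    {
        const int intermediate_bits = __builtin_clz(bitdepth_max) - 18;
        const int sh  = intermediate_bits + 1;
        const int rnd = (1 << intermediate_bits) + PREP_BIAS * 2;
        for (int i = 0; i < n; i++)
            dst[i] = iclip((tmp1[i] + tmp2[i] + rnd) >> sh, 0, bitdepth_max);
    }

    /* w_avg: fixed weight 0..16, applied to tmp1 */
    static void w_avg_c_sketch(uint16_t *dst, const int16_t *tmp1, const int16_t *tmp2,
                               const int n, const int weight, const int bitdepth_max)
    {
        const int intermediate_bits = __builtin_clz(bitdepth_max) - 18;
        const int sh  = intermediate_bits + 4;
        const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16;
        for (int i = 0; i < n; i++)
            dst[i] = iclip((tmp1[i] * weight + tmp2[i] * (16 - weight) + rnd) >> sh,
                           0, bitdepth_max);
    }

    /* mask: per-pixel weight 0..64 from the mask buffer, applied to tmp1 */
    static void mask_c_sketch(uint16_t *dst, const int16_t *tmp1, const int16_t *tmp2,
                              const uint8_t *msk, const int n, const int bitdepth_max)
    {
        const int intermediate_bits = __builtin_clz(bitdepth_max) - 18;
        const int sh  = intermediate_bits + 6;
        const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
        for (int i = 0; i < n; i++)
            dst[i] = iclip((tmp1[i] * msk[i] + tmp2[i] * (64 - msk[i]) + rnd) >> sh,
                           0, bitdepth_max);
    }

The NEON code below avoids a full-width multiply for avg by folding the 2*PREP_BIAS
offset and rounding into a single clamp/saturating-subtract pair on the 16-bit sum,
and for w_avg/mask it rewrites the weighted sum as tmp2 + (tmp1 - tmp2)*w/16 (or /64)
so only one widening multiply per lane is needed.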
Diffstat (limited to 'src/arm/64')
-rw-r--r--   src/arm/64/mc16.S   241
1 file changed, 241 insertions(+), 0 deletions(-)
diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S
new file mode 100644
index 0000000..dbd1ccc
--- /dev/null
+++ b/src/arm/64/mc16.S
@@ -0,0 +1,241 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define PREP_BIAS 8192
+
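+// avg: plain average of the two intermediate (prep) buffers in x2/x3. The
+// saturated sum is clamped from below, the 2*PREP_BIAS offset plus the
+// rounding constant is added back (v28), and the result is shifted down to
+// pixel range (v29); the constants are set up in bidir_fn below.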
+.macro avg d0, d1, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ sqadd \t0\().8h, \t0\().8h, \t2\().8h
+ sqadd \t1\().8h, \t1\().8h, \t3\().8h
+ smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
+ sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1)
+ sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1)
+.endm
+
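+// w_avg: weighted average (tmp1*w + tmp2*(16-w)) >> 4, computed as
+// tmp2 + (tmp1 - tmp2)*w/16 with the negated weight preloaded in v27,
+// widening to 32 bit for the multiply, then narrowed, rounding-shifted
+// down by intermediate_bits (v29), rebiased (v28) and clamped to
+// [0, bitdepth_max].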
+.macro w_avg d0, d1, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ // This difference requires a 17 bit range, and all bits are
+ // significant for the following multiplication.
+ ssubl \d0\().4s, \t2\().4h, \t0\().4h
+ ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
+ ssubl \d1\().4s, \t3\().4h, \t1\().4h
+ ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
+ mul \d0\().4s, \d0\().4s, v27.4s
+ mul \t0\().4s, \t0\().4s, v27.4s
+ mul \d1\().4s, \d1\().4s, v27.4s
+ mul \t1\().4s, \t1\().4s, v27.4s
+ sshr \d0\().4s, \d0\().4s, #4
+ sshr \t0\().4s, \t0\().4s, #4
+ sshr \d1\().4s, \d1\().4s, #4
+ sshr \t1\().4s, \t1\().4s, #4
+ saddw \d0\().4s, \d0\().4s, \t2\().4h
+ saddw2 \t0\().4s, \t0\().4s, \t2\().8h
+ saddw \d1\().4s, \d1\().4s, \t3\().4h
+ saddw2 \t1\().4s, \t1\().4s, \t3\().8h
+ xtn \d0\().4h, \d0\().4s
+ xtn2 \d0\().8h, \t0\().4s
+ xtn \d1\().4h, \d1\().4s
+ xtn2 \d1\().8h, \t1\().4s
+ srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
+ srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
+ add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
+ smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
+ smax \d0\().8h, \d0\().8h, v30.8h // 0
+ smax \d1\().8h, \d1\().8h, v30.8h // 0
+.endm
+
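+// mask: same as w_avg, but with a per-pixel weight (0..64) loaded from the
+// mask pointer in x6; the mask bytes are negated and sign-extended to 32 bit
+// so the tmp2 + (tmp1 - tmp2)*m/64 formulation can reuse the same pattern.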
+.macro mask d0, d1, t0, t1, t2, t3
+ ld1 {v27.16b}, [x6], 16
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ neg v27.16b, v27.16b
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ sxtl v26.8h, v27.8b
+ sxtl2 v27.8h, v27.16b
+ sxtl v24.4s, v26.4h
+ sxtl2 v25.4s, v26.8h
+ sxtl v26.4s, v27.4h
+ sxtl2 v27.4s, v27.8h
+ ssubl \d0\().4s, \t2\().4h, \t0\().4h
+ ssubl2 \t0\().4s, \t2\().8h, \t0\().8h
+ ssubl \d1\().4s, \t3\().4h, \t1\().4h
+ ssubl2 \t1\().4s, \t3\().8h, \t1\().8h
+ mul \d0\().4s, \d0\().4s, v24.4s
+ mul \t0\().4s, \t0\().4s, v25.4s
+ mul \d1\().4s, \d1\().4s, v26.4s
+ mul \t1\().4s, \t1\().4s, v27.4s
+ sshr \d0\().4s, \d0\().4s, #6
+ sshr \t0\().4s, \t0\().4s, #6
+ sshr \d1\().4s, \d1\().4s, #6
+ sshr \t1\().4s, \t1\().4s, #6
+ saddw \d0\().4s, \d0\().4s, \t2\().4h
+ saddw2 \t0\().4s, \t0\().4s, \t2\().8h
+ saddw \d1\().4s, \d1\().4s, \t3\().4h
+ saddw2 \t1\().4s, \t1\().4s, \t3\().8h
+ xtn \d0\().4h, \d0\().4s
+ xtn2 \d0\().8h, \t0\().4s
+ xtn \d1\().4h, \d1\().4s
+ xtn2 \d1\().8h, \t1\().4s
+ srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits
+ srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits
+ add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits
+ smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max
+ smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max
+ smax \d0\().8h, \d0\().8h, v30.8h // 0
+ smax \d1\().8h, \d1\().8h, v30.8h // 0
+.endm
+
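+// Shared function body for avg/w_avg/mask. Computes intermediate_bits from
+// \bdmax, sets up the per-type rounding/bias constants in v28/v29 (and the
+// negated weight in v27 for w_avg), then branches on the block width in w4
+// via the relative-offset table at the end of the function.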
+.macro bidir_fn type, bdmax
+function \type\()_16bpc_neon, export=1
+ clz w4, w4
+.ifnc \type, avg
+ dup v31.8h, \bdmax // bitdepth_max
+ movi v30.8h, #0
+.endif
+ clz w7, \bdmax
+ sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18
+.ifc \type, avg
+ mov w9, #1
+ mov w8, #-2*PREP_BIAS
+ lsl w9, w9, w7 // 1 << intermediate_bits
+ add w7, w7, #1
+ sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits
+ neg w7, w7 // -(intermediate_bits+1)
+ dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits
+ dup v29.8h, w7 // -(intermediate_bits+1)
+.else
+ mov w8, #PREP_BIAS
+ lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits
+ neg w7, w7 // -intermediate_bits
+ dup v28.8h, w8 // PREP_BIAS >> intermediate_bits
+ dup v29.8h, w7 // -intermediate_bits
+.endif
+.ifc \type, w_avg
+ dup v27.4s, w6
+ neg v27.4s, v27.4s
+.endif
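+        // Width dispatch: w4 = clz(width) - 24 indexes the table below,
+        // whose entries are stored as offsets back from the table label.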
+ adr x7, L(\type\()_tbl)
+ sub w4, w4, #24
+ \type v4, v5, v0, v1, v2, v3
+ ldrh w4, [x7, x4, lsl #1]
+ sub x7, x7, w4, uxtw
+ br x7
+40:
+ add x7, x0, x1
+ lsl x1, x1, #1
+4:
+ subs w5, w5, #4
+ st1 {v4.d}[0], [x0], x1
+ st1 {v4.d}[1], [x7], x1
+ st1 {v5.d}[0], [x0], x1
+ st1 {v5.d}[1], [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 4b
+80:
+ add x7, x0, x1
+ lsl x1, x1, #1
+8:
+ st1 {v4.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v5.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 8b
+16:
+ \type v6, v7, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h}, [x0], x1
+ subs w5, w5, #2
+ st1 {v6.8h, v7.8h}, [x0], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 16b
+32:
+ \type v6, v7, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 32b
+640:
+ add x7, x0, #64
+64:
+ \type v6, v7, v0, v1, v2, v3
+ \type v16, v17, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ \type v18, v19, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 64b
+1280:
+ add x7, x0, #64
+ mov x8, #128
+ sub x1, x1, #128
+128:
+ \type v6, v7, v0, v1, v2, v3
+ \type v16, v17, v0, v1, v2, v3
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8
+ \type v18, v19, v0, v1, v2, v3
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
+ \type v4, v5, v0, v1, v2, v3
+ \type v6, v7, v0, v1, v2, v3
+ \type v16, v17, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+ \type v18, v19, v0, v1, v2, v3
+ st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
+ b.le 0f
+ \type v4, v5, v0, v1, v2, v3
+ b 128b
+0:
+ ret
+L(\type\()_tbl):
+ .hword L(\type\()_tbl) - 1280b
+ .hword L(\type\()_tbl) - 640b
+ .hword L(\type\()_tbl) - 32b
+ .hword L(\type\()_tbl) - 16b
+ .hword L(\type\()_tbl) - 80b
+ .hword L(\type\()_tbl) - 40b
+endfunc
+.endm
+
+bidir_fn avg, w6
+bidir_fn w_avg, w7
+bidir_fn mask, w7