github.com/videolan/dav1d.git

author    Janne Grunau <janne-vlc@jannau.net>  2018-09-29 15:49:43 +0300
committer Janne Grunau <janne-vlc@jannau.net>  2018-10-21 00:58:17 +0300
commit    80e47425e6ca9834e19930ce9663009e421f314c (patch)
tree      a262156e3a2b394dbd83bdb065f85cdb4c6ebed6 /src/arm/64/mc.S
parent    1400b028cf4e24eaeab74f036f04ea8fa79240fa (diff)
arm64/mc: add 8-bit neon asm for avg, w_avg and mask
checkasm --bench on a Qualcomm Kryo (Snapdragon 820):

nop:                    33.0
avg_w4_8bpc_c:         450.5    avg_w4_8bpc_neon:       20.1
avg_w8_8bpc_c:         438.6    avg_w8_8bpc_neon:       45.2
avg_w16_8bpc_c:       1003.7    avg_w16_8bpc_neon:     112.8
avg_w32_8bpc_c:       3249.6    avg_w32_8bpc_neon:     429.9
avg_w64_8bpc_c:       7213.3    avg_w64_8bpc_neon:    1299.4
avg_w128_8bpc_c:     16791.3    avg_w128_8bpc_neon:   2978.4
w_avg_w4_8bpc_c:       605.7    w_avg_w4_8bpc_neon:     30.9
w_avg_w8_8bpc_c:       545.8    w_avg_w8_8bpc_neon:     72.9
w_avg_w16_8bpc_c:     1430.1    w_avg_w16_8bpc_neon:   193.5
w_avg_w32_8bpc_c:     4876.3    w_avg_w32_8bpc_neon:   715.3
w_avg_w64_8bpc_c:    11338.0    w_avg_w64_8bpc_neon:  2147.0
w_avg_w128_8bpc_c:   26822.0    w_avg_w128_8bpc_neon: 4596.3
mask_w4_8bpc_c:        604.6    mask_w4_8bpc_neon:      37.2
mask_w8_8bpc_c:        654.8    mask_w8_8bpc_neon:      96.0
mask_w16_8bpc_c:      1663.0    mask_w16_8bpc_neon:    272.4
mask_w32_8bpc_c:      5707.6    mask_w32_8bpc_neon:   1028.9
mask_w64_8bpc_c:     12735.3    mask_w64_8bpc_neon:   2533.2
mask_w128_8bpc_c:    31027.6    mask_w128_8bpc_neon:  6247.2
Diffstat (limited to 'src/arm/64/mc.S')
-rw-r--r--  src/arm/64/mc.S | 237
 1 file changed, 237 insertions(+), 0 deletions(-)
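
For reference, the three NEON entry points added in the diff below combine two buffers of 16-bit intermediate values (tmp1/tmp2, as produced by the mc prep functions) into 8-bit output pixels. The following is a rough C sketch of the 8 bpc semantics: the function and helper names are made up for illustration, and the rounding constants are read off the assembly rather than copied from dav1d's C sources, so treat them as a hedged approximation (the NEON w_avg/mask paths can differ slightly in intermediate rounding).

#include <stddef.h>
#include <stdint.h>

/* Illustrative names; not dav1d's actual C implementation. */
static inline uint8_t clip_u8(const int v) {
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
}

/* avg: plain bidirectional average, (tmp1 + tmp2 + 16) >> 5. */
static void avg_ref(uint8_t *dst, const ptrdiff_t dst_stride,
                    const int16_t *tmp1, const int16_t *tmp2,
                    const int w, int h)
{
    do {
        for (int x = 0; x < w; x++)
            dst[x] = clip_u8((tmp1[x] + tmp2[x] + 16) >> 5);
        tmp1 += w;  /* the intermediate buffers are packed, stride == w */
        tmp2 += w;
        dst += dst_stride;
    } while (--h);
}

/* w_avg: weighted average with a single weight in 1..15. */
static void w_avg_ref(uint8_t *dst, const ptrdiff_t dst_stride,
                      const int16_t *tmp1, const int16_t *tmp2,
                      const int w, int h, const int weight)
{
    do {
        for (int x = 0; x < w; x++)
            dst[x] = clip_u8((tmp1[x] * weight +
                              tmp2[x] * (16 - weight) + 128) >> 8);
        tmp1 += w;
        tmp2 += w;
        dst += dst_stride;
    } while (--h);
}

/* mask: per-pixel blend with factors msk[x] in 0..64. */
static void mask_ref(uint8_t *dst, const ptrdiff_t dst_stride,
                     const int16_t *tmp1, const int16_t *tmp2,
                     const int w, int h, const uint8_t *msk)
{
    do {
        for (int x = 0; x < w; x++)
            dst[x] = clip_u8((tmp1[x] * msk[x] +
                              tmp2[x] * (64 - msk[x]) + 512) >> 10);
        msk += w;
        tmp1 += w;
        tmp2 += w;
        dst += dst_stride;
    } while (--h);
}
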
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
new file mode 100644
index 0000000..a9d2fe3
--- /dev/null
+++ b/src/arm/64/mc.S
@@ -0,0 +1,237 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#if BITDEPTH == 8
+
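+// Register arguments, following dav1d's mc function signatures:
+//   x0 = dst, x1 = dst_stride, x2 = tmp1, x3 = tmp2, w4 = width, w5 = height,
+//   w6 = weight (w_avg only), x6 = msk pointer (mask only).
+// tmp1/tmp2 hold 16-bit intermediate values; results are narrowed back to
+// 8-bit pixels with rounding and unsigned saturation.
+
+// avg/avg16: dst = sat_u8((tmp1 + tmp2 + 16) >> 5), 8 or 16 pixels per call.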
+.macro avg dst, t0, t1
+ ld1 {\t0\().8h}, [x2], 16
+ ld1 {\t1\().8h}, [x3], 16
+ add \t0\().8h, \t0\().8h, \t1\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #5
+.endm
+
+.macro avg16 dst, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ add \t0\().8h, \t0\().8h, \t2\().8h
+ add \t1\().8h, \t1\().8h, \t3\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #5
+ sqrshrun2 \dst\().16b, \t1\().8h, #5
+.endm
+
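+// w_avg/w_avg16: dst = sat_u8((tmp1*weight + tmp2*(16 - weight) + 128) >> 8),
+// computed as tmp2 + ((tmp1 - tmp2)*weight >> 4) using sqdmulh with
+// v30 = -weight << 11 (set up in bidir_fn), then a rounding narrow by 4.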
+.macro w_avg dst, t0, t1
+ ld1 {\t0\().8h}, [x2], 16
+ ld1 {\t1\().8h}, [x3], 16
+ sub \t0\().8h, \t1\().8h, \t0\().8h
+ sqdmulh \t0\().8h, \t0\().8h, v30.8h
+ add \t0\().8h, \t1\().8h, \t0\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #4
+.endm
+
+.macro w_avg16 dst, t0, t1, t2, t3
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ sub \t0\().8h, \t2\().8h, \t0\().8h
+ sub \t1\().8h, \t3\().8h, \t1\().8h
+ sqdmulh \t0\().8h, \t0\().8h, v30.8h
+ sqdmulh \t1\().8h, \t1\().8h, v30.8h
+ add \t0\().8h, \t2\().8h, \t0\().8h
+ add \t1\().8h, \t3\().8h, \t1\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #4
+ sqrshrun2 \dst\().16b, \t1\().8h, #4
+.endm
+
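+// mask/mask16: per-pixel blend, m = msk[x] in 0..64 loaded from x6:
+// dst = sat_u8((tmp1*m + tmp2*(64 - m) + 512) >> 10). The mul by v31 (-2)
+// plus shll #8 turns m into -m << 9, so the same sqdmulh trick as w_avg works.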
+.macro mask dst, t0, t1
+ ld1 {v30.8b}, [x6], 8
+ ld1 {\t0\().8h}, [x2], 16
+ mul v30.8b, v30.8b, v31.8b
+ ld1 {\t1\().8h}, [x3], 16
+ shll v30.8h, v30.8b, #8
+ sub \t0\().8h, \t1\().8h, \t0\().8h
+ sqdmulh \t0\().8h, \t0\().8h, v30.8h
+ add \t0\().8h, \t1\().8h, \t0\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #4
+.endm
+
+.macro mask16 dst, t0, t1, t2, t3
+ ld1 {v30.16b}, [x6], 16
+ ld1 {\t0\().8h,\t1\().8h}, [x2], 32
+ mul v30.16b, v30.16b, v31.16b
+ ld1 {\t2\().8h,\t3\().8h}, [x3], 32
+ shll v28.8h, v30.8b, #8
+ shll2 v29.8h, v30.16b, #8
+ sub \t0\().8h, \t2\().8h, \t0\().8h
+ sub \t1\().8h, \t3\().8h, \t1\().8h
+ sqdmulh \t0\().8h, \t0\().8h, v28.8h
+ sqdmulh \t1\().8h, \t1\().8h, v29.8h
+ add \t0\().8h, \t2\().8h, \t0\().8h
+ add \t1\().8h, \t3\().8h, \t1\().8h
+ sqrshrun \dst\().8b, \t0\().8h, #4
+ sqrshrun2 \dst\().16b, \t1\().8h, #4
+.endm
+
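+// bidir_fn: instantiate a complete avg/w_avg/mask function. The power-of-two
+// width is dispatched through the .hword offset table at the end of the
+// function (rbit + clz yields ctz(w) as the table index); the labels ending
+// in 0 (160:, 320:, ...) do one-time setup before falling into their loops.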
+.macro bidir_fn type
+function \type\()_8bpc_neon, export=1
+.ifc \type, w_avg
+ dup v30.8h, w6
+ neg v30.8h, v30.8h
+ shl v30.8h, v30.8h, #11
+.endif
+.ifc \type, mask
+ movi v31.16b, #256-2
+.endif
+ rbit w4, w4
+ adr x7, \type\()_tbl
+ clz w4, w4
+ \type v4, v0, v1
+ ldrh w4, [x7, x4, lsl #1]
+ \type v5, v2, v3
+ sub x7, x7, w4, uxth
+ br x7
+4:
+ cmp w5, #4
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x0], x1
+ st1 {v5.s}[0], [x0], x1
+ st1 {v5.s}[1], [x0], x1
+ b.eq 0f
+ \type v6, v0, v1
+ \type v7, v2, v3
+ cmp w5, #8
+ st1 {v6.s}[0], [x0], x1
+ st1 {v6.s}[1], [x0], x1
+ st1 {v7.s}[0], [x0], x1
+ st1 {v7.s}[1], [x0], x1
+ b.eq 0f
+ \type v4, v0, v1
+ \type v5, v2, v3
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x0], x1
+ \type v6, v0, v1
+ st1 {v5.s}[0], [x0], x1
+ st1 {v5.s}[1], [x0], x1
+ \type v7, v2, v3
+ st1 {v6.s}[0], [x0], x1
+ st1 {v6.s}[1], [x0], x1
+ st1 {v7.s}[0], [x0], x1
+ st1 {v7.s}[1], [x0], x1
+ ret
+8:
+ st1 {v4.8b}, [x0], x1
+ \type v6, v0, v1
+ st1 {v5.8b}, [x0], x1
+ \type v7, v0, v1
+ st1 {v6.8b}, [x0], x1
+ subs w5, w5, #4
+ st1 {v7.8b}, [x0], x1
+ b.le 0f
+ \type v4, v0, v1
+ \type v5, v2, v3
+ b 8b
+160:
+ trn1 v4.2d, v4.2d, v5.2d
+16:
+ \type\()16 v5, v0, v1, v2, v3
+ st1 {v4.16b}, [x0], x1
+ \type\()16 v6, v0, v1, v2, v3
+ st1 {v5.16b}, [x0], x1
+ \type\()16 v7, v0, v1, v2, v3
+ st1 {v6.16b}, [x0], x1
+ subs w5, w5, #4
+ st1 {v7.16b}, [x0], x1
+ b.le 0f
+ \type\()16 v4, v0, v1, v2, v3
+ b 16b
+320:
+ trn1 v4.2d, v4.2d, v5.2d
+ add x7, x0, x1
+ lsl x1, x1, #1
+32:
+ \type\()16 v5, v0, v1, v2, v3
+ \type\()16 v6, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b}, [x0], x1
+ \type\()16 v7, v0, v1, v2, v3
+ subs w5, w5, #2
+ st1 {v6.16b,v7.16b}, [x7], x1
+ b.le 0f
+ \type\()16 v4, v0, v1, v2, v3
+ b 32b
+640:
+ trn1 v4.2d, v4.2d, v5.2d
+ add x7, x0, x1
+ lsl x1, x1, #1
+64:
+ \type\()16 v5, v0, v1, v2, v3
+ \type\()16 v6, v0, v1, v2, v3
+ \type\()16 v7, v0, v1, v2, v3
+ \type\()16 v16, v0, v1, v2, v3
+ \type\()16 v17, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+ \type\()16 v18, v0, v1, v2, v3
+ \type\()16 v19, v0, v1, v2, v3
+ subs w5, w5, #2
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+ b.le 0f
+ \type\()16 v4, v0, v1, v2, v3
+ b 64b
+1280:
+ trn1 v4.2d, v4.2d, v5.2d
+ add x7, x0, #64
+128:
+ \type\()16 v5, v0, v1, v2, v3
+ \type\()16 v6, v0, v1, v2, v3
+ \type\()16 v7, v0, v1, v2, v3
+ \type\()16 v16, v0, v1, v2, v3
+ \type\()16 v17, v0, v1, v2, v3
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
+ \type\()16 v18, v0, v1, v2, v3
+ \type\()16 v19, v0, v1, v2, v3
+ subs w5, w5, #1
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
+ b.le 0f
+ \type\()16 v4, v0, v1, v2, v3
+ b 128b
+0:
+ ret
+\type\()_tbl:
+ .hword 0, 0
+ .hword \type\()_tbl - 4b
+ .hword \type\()_tbl - 8b
+ .hword \type\()_tbl - 160b
+ .hword \type\()_tbl - 320b
+ .hword \type\()_tbl - 640b
+ .hword \type\()_tbl - 1280b
+endfunc
+.endm
+
+bidir_fn avg
+bidir_fn w_avg
+bidir_fn mask
+
+#endif /* BITDEPTH == 8 */