Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/src/arm/64
diff options
context:
space:
mode:
author: Martin Storsjö <martin@martin.st>, 2020-01-28 12:07:14 +0300
committer: Martin Storsjö <martin@martin.st>, 2020-01-29 15:51:55 +0300
commit: 361a3c8ee2d03f87f42a76213ee0f93e49fa9ec3 (patch)
tree: 52322335158061ab5c153f256406e65891d92dd5 /src/arm/64
parent: 6ad9bd5f92621d81a227b6d271c29dfaa578000a (diff)
arm: cdef: Add special cased versions for pri_strength/sec_strength being zero
Before:
ARM32: Cortex                   A7      A8      A9     A53     A72     A73
cdef_filter_4x4_8bpc_neon:   964.6   599.5   707.9   601.2   465.1   405.2
cdef_filter_4x8_8bpc_neon:  1726.0  1066.2  1238.7  1041.7   798.6   725.3
cdef_filter_8x8_8bpc_neon:  2974.4  1671.8  1943.9  1806.1  1229.8  1242.1
ARM64:
cdef_filter_4x4_8bpc_neon:                           569.2   337.8   348.7
cdef_filter_4x8_8bpc_neon:                          1031.1   623.3   633.6
cdef_filter_8x8_8bpc_neon:                          1847.5  1097.7  1117.5

After:
ARM32: Cortex                   A7      A8      A9     A53     A72     A73
cdef_filter_4x4_8bpc_neon:   798.4   524.2   617.3   506.8   432.4   361.1
cdef_filter_4x8_8bpc_neon:  1394.7   910.4  1054.0   863.6   730.2   632.2
cdef_filter_8x8_8bpc_neon:  2364.6  1453.8  1675.1  1466.0  1086.4  1107.7
ARM64:
cdef_filter_4x4_8bpc_neon:                           461.7   303.1   308.6
cdef_filter_4x8_8bpc_neon:                           833.0   547.5   556.0
cdef_filter_8x8_8bpc_neon:                          1459.3   934.1   967.9
Diffstat (limited to 'src/arm/64')
-rw-r--r-- src/arm/64/cdef.S 61
1 files changed, 53 insertions, 8 deletions
diff --git a/src/arm/64/cdef.S b/src/arm/64/cdef.S
index b116239..c67af05 100644
--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -286,13 +286,13 @@ endconst
ld1 {\d2\().d}[1], [x9] // p1
.endif
.endm
-.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
+.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap, min
+.if \min
umin v2.8h, v2.8h, \s1\().8h
smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
smax v3.8h, v3.8h, \s2\().8h
-
- cbz \threshold, 3f
+.endif
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
@@ -316,25 +316,35 @@ endconst
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
-.macro filter w
-function cdef_filter\w\()_neon, export=1
+.macro filter_func w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_neon
+.if \pri
movrel x8, pri_taps
and w9, w3, #1
add x8, x8, w9, uxtw #1
+.endif
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.4h, #15
dup v28.4h, w6 // damping
+.if \pri
dup v25.8h, w3 // threshold
+.endif
+.if \sec
dup v27.8h, w4 // threshold
+.endif
trn1 v24.4h, v25.4h, v27.4h
clz v24.4h, v24.4h // clz(threshold)
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
neg v24.4h, v24.4h // -shift
+.if \sec
dup v26.8h, v24.h[1]
+.endif
+.if \pri
dup v24.8h, v24.h[0]
+.endif
1:
.if \w == 8
@@ -346,45 +356,62 @@ function cdef_filter\w\()_neon, export=1
.endif
movi v1.8h, #0 // sum
+.if \min
mov v2.16b, v0.16b // min
mov v3.16b, v0.16b // max
+.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
mov w11, #2 // sec_taps[0]
2:
+.if \pri
ldrb w9, [x5] // off1
load_px v4, v5, \w
+.endif
+.if \sec
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px v6, v7, \w
+.endif
+.if \pri
ldrb w10, [x8] // *pri_taps
- handle_pixel v4, v5, w3, v25.8h, v24.8h, w10
+ handle_pixel v4, v5, w3, v25.8h, v24.8h, w10, \min
+.endif
+.if \sec
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px v4, v5, \w
- handle_pixel v6, v7, w4, v27.8h, v26.8h, w11
+ handle_pixel v6, v7, w4, v27.8h, v26.8h, w11, \min
- handle_pixel v4, v5, w4, v27.8h, v26.8h, w11
+ handle_pixel v4, v5, w4, v27.8h, v26.8h, w11, \min
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
+.else
+ add x5, x5, #1 // x5 += 1
+.endif
subs w11, w11, #1 // sec_tap-- (value)
+.if \pri
add x8, x8, #1 // pri_taps++ (pointer)
+.endif
b.ne 2b
sshr v4.8h, v1.8h, #15 // -(sum < 0)
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
+.if \min
smin v0.8h, v0.8h, v3.8h
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
+.endif
xtn v0.8b, v0.8h
.if \w == 8
add x2, x2, #2*16 // tmp += tmp_stride
@@ -399,13 +426,31 @@ function cdef_filter\w\()_neon, export=1
// Reset pri_taps and directions back to the original point
sub x5, x5, #2
+.if \pri
sub x8, x8, #2
+.endif
b.gt 1b
ret
endfunc
.endm
+.macro filter w
+filter_func \w, pri=1, sec=0, min=0, suffix=_pri // primary taps only; min/max tracking and the final clip are compiled out
+filter_func \w, pri=0, sec=1, min=0, suffix=_sec // secondary taps only; min/max tracking and the final clip are compiled out
+filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec // both tap sets, including the iclip(px + .., min, max) step
+ // Exported entry point for width \w: picks one of the three variants above based on which strengths are zero.
+function cdef_filter\w\()_neon, export=1
+ cbnz w3, 1f // pri_strength != 0: go on to check sec_strength
+ b cdef_filter\w\()_sec_neon // pri_strength == 0: only sec (also taken when sec_strength is 0 too)
+1:
+ cbnz w4, 1f // sec_strength != 0: both strengths are set
+ b cdef_filter\w\()_pri_neon // sec_strength == 0: only pri
+1:
+ b cdef_filter\w\()_pri_sec_neon // both pri and sec; plain branch, so the variant's ret returns to our caller
+endfunc
+.endm
+
filter 8
filter 4