github.com/videolan/dav1d.git
author    Victorien Le Couviour--Tuffet <victorien@videolan.org>  2021-10-11 15:57:11 +0300
committer Victorien Le Couviour--Tuffet <victorien@videolan.org>  2021-10-29 23:18:20 +0300
commit    f7e0d4c032dad31a46129c9eaf30ad5c2f704d2b (patch)
tree      edcea1c0cc1e534ff17fa31ba9c9c40154dbdfbf /src/arm
parent    609fbaba84b8e15a25de9efbc8b14988be6df94d (diff)
Remove lpf_stride parameter from LR filters
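
The lpf rows are now addressed with the picture stride (see the
`lpf += stride` hunks below), so the separate lpf_stride argument can be
dropped. Under the standard AArch64 calling convention (AAPCS64, first
eight integer arguments in x0-x7), removing the fourth argument shifts
every later parameter down one slot. A sketch of the resulting register
assignment for the 8bpc entry points (the register notes are editorial,
derived from the comment headers and register renames in this patch):

// void dav1d_wiener_filter7_8bpc_neon(pixel *p,                      // x0
//                                     const ptrdiff_t stride,        // x1
//                                     const pixel (*left)[4],        // x2
//                                     const pixel *lpf,              // x3
//                                     const int w,                   // w4 (was w5)
//                                     int h,                         // w5 (was w6)
//                                     const int16_t filter[2][8],    // x6 (was x7)
//                                     const enum LrEdgeFlags edges); // w7 (was [sp])

With edges now arriving in w7, the 8bpc functions drop the `ldr w8, [sp]`
stack load entirely, and the 16bpc functions keep only bitdepth_max on
the stack (now loaded into w8), which also removes the __APPLE__-specific
offset for what used to be the second stack argument.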
Diffstat (limited to 'src/arm')
-rw-r--r--  src/arm/64/looprestoration.S          | 200
-rw-r--r--  src/arm/64/looprestoration16.S        | 222
-rw-r--r--  src/arm/looprestoration_init_tmpl.c   |  72
3 files changed, 231 insertions, 263 deletions
diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S
index 35e71b8..778448a 100644
--- a/src/arm/64/looprestoration.S
+++ b/src/arm/64/looprestoration.S
@@ -44,18 +44,16 @@ right_ext_mask:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
-// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t p_stride,
-// const pixel (*left)[4],
-// const pixel *lpf, const ptrdiff_t lpf_stride,
+// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
+// const pixel (*left)[4], const pixel *lpf,
// const int w, int h,
// const int16_t filter[2][8],
// const enum LrEdgeFlags edges);
function wiener_filter7_8bpc_neon, export=1
- ldr w8, [sp]
stp x29, x30, [sp, #-16]!
mov x29, sp
- ld1 {v0.8h, v1.8h}, [x7]
- tst w8, #4 // LR_HAVE_TOP
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
sub_sp 384*2*6
mov w17, #(1 << 14) - (1 << 2)
@@ -75,50 +73,48 @@ function wiener_filter7_8bpc_neon, export=1
mov x16, x2 // backup left
mov x2, #0
bl wiener_filter7_h_8bpc_neon
- add x3, x3, x4 // lpf += lpf_stride
+ add x3, x3, x1 // lpf += stride
mov x9, x14 // t6
mov x10, x14 // t5
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter7_h_8bpc_neon
- add x3, x3, x4, lsl #2
- add x3, x3, x4 // lpf += lpf_stride*5
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
mov x11, x14 // t4
add x14, x14, #384*2 // t1 += 384*2
mov x2, x16 // left
mov x16, x3 // backup lpf
mov x3, x0 // lpf = p
bl wiener_filter7_h_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
mov x12, x14 // t3
mov x13, x14 // t2
b.eq L(v1_7)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter7_h_8bpc_neon
mov x13, x14 // t2
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v2_7)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter7_h_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v3_7)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
L(main_7):
add x15, x14, #384*2 // t0 = t1 + 384*2
L(main_loop_7):
bl wiener_filter7_hv_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.ne L(main_loop_7)
- tst w8, #8 // LR_HAVE_BOTTOM
+ tst w7, #8 // LR_HAVE_BOTTOM
b.eq L(v3_7)
mov x3, x16 // restore lpf
mov x2, #0 // left = NULL
- sub x4, x4, x1 // lpf_stride - p_stride
bl wiener_filter7_hv_8bpc_neon
- add x3, x3, x4 // src += lpf_stride - p_stride
bl wiener_filter7_hv_8bpc_neon
L(v1_7):
bl wiener_filter7_v_8bpc_neon
@@ -128,37 +124,37 @@ L(v1_7):
ret
L(no_top_7):
- add x3, x3, x4, lsl #2
- add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
mov x3, x0 // lpf = p
bl wiener_filter7_h_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
mov x9, x14 // t6
mov x10, x14 // t5
mov x11, x14 // t4
mov x12, x14 // t3
mov x13, x14 // t2
b.eq L(v1_7)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter7_h_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
mov x13, x14 // t2
b.eq L(v2_7)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter7_h_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v3_7)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x15, x14, #384*2 // t0 = t1 + 384*2
bl wiener_filter7_hv_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v3_7)
add x15, x15, #384*2*4 // t0 += 384*2*4
bl wiener_filter7_hv_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.ne L(main_7)
L(v3_7):
bl wiener_filter7_v_8bpc_neon
@@ -169,11 +165,11 @@ endfunc
function wiener_filter7_h_8bpc_neon
- stp x3, x5, [sp, #-32]!
+ stp x3, x4, [sp, #-32]!
str x14, [sp, #16]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
- tst w8, #1 // LR_HAVE_LEFT
+ tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
@@ -208,13 +204,13 @@ function wiener_filter7_h_8bpc_neon
uxtl2 v3.8h, v3.16b
uxtl v4.8h, v4.8b
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
- cmp w5, #19
+ cmp w4, #19
b.ge 4f // If w >= 19, all used input pixels are valid
// 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
@@ -223,14 +219,14 @@ function wiener_filter7_h_8bpc_neon
// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
- sub w17, w5, #22
+ sub w17, w4, #22
// Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
- movrel x7, right_ext_mask, -6
+ movrel x6, right_ext_mask, -6
ldr b28, [x3, w17, sxtw]
- sub x7, x7, w5, uxtw #1
+ sub x6, x6, w4, uxtw #1
dup v28.8h, v28.h[0]
- ld1 {v25.16b, v26.16b, v27.16b}, [x7]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
bit v2.16b, v28.16b, v25.16b
bit v3.16b, v28.16b, v26.16b
@@ -280,14 +276,14 @@ function wiener_filter7_h_8bpc_neon
add v6.8h, v6.8h, v31.8h
add v7.8h, v7.8h, v31.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v6.8h, v7.8h}, [x14], #32
b.le 0f
mov v2.16b, v4.16b
ld1 {v4.16b}, [x3], #16
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
uxtl v3.8h, v4.8b
uxtl2 v4.8h, v4.16b
b.ne 4b // If we don't need to pad, just keep filtering.
@@ -295,7 +291,7 @@ function wiener_filter7_h_8bpc_neon
0:
ldr x14, [sp, #16]
- ldp x3, x5, [sp], #32
+ ldp x3, x4, [sp], #32
ret
endfunc
@@ -305,7 +301,7 @@ function wiener_filter7_v_8bpc_neon
stp x10, x11, [sp, #-64]!
stp x12, x13, [sp, #16]
stp x14, x14, [sp, #32]
- stp x0, x5, [sp, #48]
+ stp x0, x4, [sp, #48]
1:
ld1 {v20.8h, v21.8h}, [x11], #32
ld1 {v24.8h, v25.8h}, [x13], #32
@@ -345,11 +341,11 @@ function wiener_filter7_v_8bpc_neon
sqrshrun2 v3.8h, v5.4s, #11
sqxtun v2.8b, v2.8h
sqxtun2 v2.16b, v3.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v2.16b}, [x0], #16
b.gt 1b
- ldp x0, x5, [sp, #48]
+ ldp x0, x4, [sp, #48]
ldp x13, x14, [sp, #32]
ldp x11, x12, [sp, #16]
ldp x9, x10, [sp], #64
@@ -365,10 +361,10 @@ function wiener_filter7_hv_8bpc_neon
stp x12, x13, [sp, #16]
stp x14, x15, [sp, #32]
stp x10, x0, [sp, #48]
- stp x3, x5, [sp, #64]
+ stp x3, x4, [sp, #64]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
- tst w8, #1 // LR_HAVE_LEFT
+ tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
@@ -402,13 +398,13 @@ function wiener_filter7_hv_8bpc_neon
uxtl2 v3.8h, v3.16b
uxtl v4.8h, v4.8b
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
- cmp w5, #19
+ cmp w4, #19
b.ge 4f // If w >= 19, all used input pixels are valid
// 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
@@ -417,14 +413,14 @@ function wiener_filter7_hv_8bpc_neon
// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
- sub w17, w5, #22
+ sub w17, w4, #22
// Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
- movrel x7, right_ext_mask, -6
+ movrel x6, right_ext_mask, -6
ldr b28, [x3, w17, sxtw]
- sub x7, x7, w5, uxtw #1
+ sub x6, x6, w4, uxtw #1
dup v28.8h, v28.h[0]
- ld1 {v25.16b, v26.16b, v27.16b}, [x7]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
bit v2.16b, v28.16b, v25.16b
bit v3.16b, v28.16b, v26.16b
@@ -510,21 +506,21 @@ function wiener_filter7_hv_8bpc_neon
st1 {v6.8h, v7.8h}, [x15], #32
sqxtun v18.8b, v18.8h
sqxtun2 v18.16b, v19.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v18.16b}, [x0], #16
b.le 0f
mov v2.16b, v4.16b
ld1 {v4.16b}, [x3], #16
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
uxtl v3.8h, v4.8b
uxtl2 v4.8h, v4.16b
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
0:
- ldp x3, x5, [sp, #64]
+ ldp x3, x4, [sp, #64]
ldp x15, x0, [sp, #48]
ldp x13, x14, [sp, #32]
ldp x11, x12, [sp, #16]
@@ -536,18 +532,16 @@ function wiener_filter7_hv_8bpc_neon
ret
endfunc
-// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t p_stride,
-// const pixel (*left)[4],
-// const pixel *lpf, const ptrdiff_t lpf_stride,
+// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride,
+// const pixel (*left)[4], const pixel *lpf,
// const int w, int h,
// const int16_t filter[2][8],
// const enum LrEdgeFlags edges);
function wiener_filter5_8bpc_neon, export=1
- ldr w8, [sp]
stp x29, x30, [sp, #-16]!
mov x29, sp
- ld1 {v0.8h, v1.8h}, [x7]
- tst w8, #4 // LR_HAVE_TOP
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
sub_sp 384*2*4
mov w17, #(1 << 14) - (1 << 2)
@@ -565,42 +559,40 @@ function wiener_filter5_8bpc_neon, export=1
mov x16, x2 // backup left
mov x2, #0
bl wiener_filter5_h_8bpc_neon
- add x3, x3, x4 // lpf += lpf_stride
+ add x3, x3, x1 // lpf += stride
mov x11, x14 // t4
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter5_h_8bpc_neon
- add x3, x3, x4, lsl #2
- add x3, x3, x4 // lpf += lpf_stride*5
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
mov x12, x14 // t3
add x14, x14, #384*2 // t1 += 384*2
mov x2, x16 // left
mov x16, x3 // backup lpf
mov x3, x0 // lpf = p
bl wiener_filter5_h_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
mov x13, x14 // t2
b.eq L(v1_5)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter5_h_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v2_5)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
L(main_5):
mov x15, x11 // t0 = t4
L(main_loop_5):
bl wiener_filter5_hv_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.ne L(main_loop_5)
- tst w8, #8 // LR_HAVE_BOTTOM
+ tst w7, #8 // LR_HAVE_BOTTOM
b.eq L(v2_5)
mov x3, x16 // restore lpf
mov x2, #0 // left = NULL
- sub x4, x4, x1 // lpf_stride - p_stride
bl wiener_filter5_hv_8bpc_neon
- add x3, x3, x4 // src += lpf_stride - p_stride
bl wiener_filter5_hv_8bpc_neon
L(end_5):
@@ -609,29 +601,29 @@ L(end_5):
ret
L(no_top_5):
- add x3, x3, x4, lsl #2
- add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
mov x3, x0 // lpf = p
bl wiener_filter5_h_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
mov x11, x14 // t4
mov x12, x14 // t3
mov x13, x14 // t2
b.eq L(v1_5)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter5_h_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v2_5)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x15, x14, #384*2 // t0 = t1 + 384*2
bl wiener_filter5_hv_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v2_5)
add x15, x15, #384*2*3 // t0 += 384*2*3
bl wiener_filter5_hv_8bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.ne L(main_5)
L(v2_5):
bl wiener_filter5_v_8bpc_neon
@@ -646,11 +638,11 @@ endfunc
function wiener_filter5_h_8bpc_neon
- stp x3, x5, [sp, #-32]!
+ stp x3, x4, [sp, #-32]!
str x14, [sp, #16]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
- tst w8, #1 // LR_HAVE_LEFT
+ tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
@@ -685,13 +677,13 @@ function wiener_filter5_h_8bpc_neon
uxtl2 v3.8h, v3.16b
uxtl v4.8h, v4.8b
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
- cmp w5, #18
+ cmp w4, #18
b.ge 4f // If w >= 18, all used input pixels are valid
// 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
@@ -700,14 +692,14 @@ function wiener_filter5_h_8bpc_neon
// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
- sub w17, w5, #23
+ sub w17, w4, #23
// Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
// buffer pointer.
- movrel x7, right_ext_mask, -4
+ movrel x6, right_ext_mask, -4
ldr b28, [x3, w17, sxtw]
- sub x7, x7, w5, uxtw #1
+ sub x6, x6, w4, uxtw #1
dup v28.8h, v28.h[0]
- ld1 {v25.16b, v26.16b, v27.16b}, [x7]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
bit v2.16b, v28.16b, v25.16b
bit v3.16b, v28.16b, v26.16b
@@ -748,14 +740,14 @@ function wiener_filter5_h_8bpc_neon
add v6.8h, v6.8h, v31.8h
add v7.8h, v7.8h, v31.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v6.8h, v7.8h}, [x14], #32
b.le 0f
mov v2.16b, v4.16b
ld1 {v4.16b}, [x3], #16
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
uxtl v3.8h, v4.8b
uxtl2 v4.8h, v4.16b
b.ne 4b // If we don't need to pad, just keep filtering.
@@ -763,14 +755,14 @@ function wiener_filter5_h_8bpc_neon
0:
ldr x14, [sp, #16]
- ldp x3, x5, [sp], #32
+ ldp x3, x4, [sp], #32
ret
endfunc
function wiener_filter5_v_8bpc_neon
stp x11, x12, [sp, #-48]!
stp x13, x14, [sp, #16]
- stp x0, x5, [sp, #32]
+ stp x0, x4, [sp, #32]
1:
ld1 {v18.8h, v19.8h}, [x12], #32
ld1 {v22.8h, v23.8h}, [x14], #32
@@ -800,11 +792,11 @@ function wiener_filter5_v_8bpc_neon
sqrshrun2 v3.8h, v5.4s, #11
sqxtun v2.8b, v2.8h
sqxtun2 v2.16b, v3.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v2.16b}, [x0], #16
b.gt 1b
- ldp x0, x5, [sp, #32]
+ ldp x0, x4, [sp, #32]
ldp x13, x14, [sp, #16]
ldp x11, x12, [sp], #48
@@ -817,10 +809,10 @@ function wiener_filter5_hv_8bpc_neon
stp x12, x13, [sp, #-64]!
stp x14, x15, [sp, #16]
stp x12, x0, [sp, #32]
- stp x3, x5, [sp, #48]
+ stp x3, x4, [sp, #48]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
- tst w8, #1 // LR_HAVE_LEFT
+ tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
@@ -854,13 +846,13 @@ function wiener_filter5_hv_8bpc_neon
uxtl2 v3.8h, v3.16b
uxtl v4.8h, v4.8b
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
- cmp w5, #18
+ cmp w4, #18
b.ge 4f // If w >= 18, all used input pixels are valid
// 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
@@ -869,14 +861,14 @@ function wiener_filter5_hv_8bpc_neon
// The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
- sub w17, w5, #23
+ sub w17, w4, #23
// Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
// buffer pointer.
- movrel x7, right_ext_mask, -4
+ movrel x6, right_ext_mask, -4
ldr b28, [x3, w17, sxtw]
- sub x7, x7, w5, uxtw #1
+ sub x6, x6, w4, uxtw #1
dup v28.8h, v28.h[0]
- ld1 {v25.16b, v26.16b, v27.16b}, [x7]
+ ld1 {v25.16b, v26.16b, v27.16b}, [x6]
bit v2.16b, v28.16b, v25.16b
bit v3.16b, v28.16b, v26.16b
@@ -944,21 +936,21 @@ function wiener_filter5_hv_8bpc_neon
st1 {v6.8h, v7.8h}, [x15], #32
sqxtun v18.8b, v18.8h
sqxtun2 v18.16b, v19.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v18.16b}, [x0], #16
b.le 0f
mov v2.16b, v4.16b
ld1 {v4.16b}, [x3], #16
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
uxtl v3.8h, v4.8b
uxtl2 v4.8h, v4.16b
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
0:
- ldp x3, x5, [sp, #48]
+ ldp x3, x4, [sp, #48]
ldp x15, x0, [sp, #32]
ldp x13, x14, [sp, #16]
ldp x11, x12, [sp], #64
diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S
index 69efa4e..fcb4f84 100644
--- a/src/arm/64/looprestoration16.S
+++ b/src/arm/64/looprestoration16.S
@@ -45,36 +45,30 @@ right_ext_mask:
endconst
// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
-// const pixel (*left)[4],
-// const pixel *lpf, const ptrdiff_t lpf_stride,
+// const pixel (*left)[4], const pixel *lpf,
// const int w, int h,
// const int16_t filter[2][8],
// const enum LrEdgeFlags edges,
// const int bitdepth_max);
function wiener_filter7_16bpc_neon, export=1
ldr w8, [sp]
-#ifdef __APPLE__
- ldr w9, [sp, #4]
-#else
- ldr w9, [sp, #8]
-#endif
stp x29, x30, [sp, #-32]!
stp d8, d9, [sp, #16]
mov x29, sp
- ld1 {v0.8h, v1.8h}, [x7]
- tst w8, #4 // LR_HAVE_TOP
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
sub_sp 384*2*6
- dup v28.8h, w9 // bitdepth_max
- clz w9, w9
+ dup v28.8h, w8 // bitdepth_max
+ clz w8, w8
movi v30.4s, #1
- sub w10, w9, #38 // -(bitdepth + 6)
- sub w11, w9, #11 // round_bits_v
- sub w9, w9, #25 // -round_bits_h
+ sub w10, w8, #38 // -(bitdepth + 6)
+ sub w11, w8, #11 // round_bits_v
+ sub w8, w8, #25 // -round_bits_h
neg w10, w10 // bitdepth + 6
neg w11, w11 // -round_bits_v
dup v2.4s, w10
- dup v29.4s, w9 // -round_bits_h
+ dup v29.4s, w8 // -round_bits_h
dup v27.4s, w11 // -round_bits_v
movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)
@@ -94,50 +88,48 @@ function wiener_filter7_16bpc_neon, export=1
mov x16, x2 // backup left
mov x2, #0
bl wiener_filter7_h_16bpc_neon
- add x3, x3, x4 // lpf += lpf_stride
+ add x3, x3, x1 // lpf += stride
mov x9, x14 // t6
mov x10, x14 // t5
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter7_h_16bpc_neon
- add x3, x3, x4, lsl #2
- add x3, x3, x4 // lpf += lpf_stride*5
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
mov x11, x14 // t4
add x14, x14, #384*2 // t1 += 384*2
mov x2, x16 // left
mov x16, x3 // backup lpf
mov x3, x0 // lpf = p
bl wiener_filter7_h_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
mov x12, x14 // t3
mov x13, x14 // t2
b.eq L(v1_7)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter7_h_16bpc_neon
mov x13, x14 // t2
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v2_7)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter7_h_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v3_7)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
L(main_7):
add x15, x14, #384*2 // t0 = t1 + 384*2
L(main_loop_7):
bl wiener_filter7_hv_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.ne L(main_loop_7)
- tst w8, #8 // LR_HAVE_BOTTOM
+ tst w7, #8 // LR_HAVE_BOTTOM
b.eq L(v3_7)
mov x3, x16 // restore lpf
mov x2, #0 // left = NULL
- sub x4, x4, x1 // lpf_stride - p_stride
bl wiener_filter7_hv_16bpc_neon
- add x3, x3, x4 // src += lpf_stride - p_stride
bl wiener_filter7_hv_16bpc_neon
L(v1_7):
bl wiener_filter7_v_16bpc_neon
@@ -148,12 +140,12 @@ L(v1_7):
ret
L(no_top_7):
- add x3, x3, x4, lsl #2
- add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
mov x3, x0 // lpf = p
bl wiener_filter7_h_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
mov x9, x14 // t6
mov x10, x14 // t5
mov x11, x14 // t4
@@ -163,22 +155,22 @@ L(no_top_7):
add x3, x3, x1 // src += p_stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter7_h_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
mov x13, x14 // t2
b.eq L(v2_7)
add x3, x3, x1 // src += p_stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter7_h_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v3_7)
add x3, x3, x1 // src += p_stride
add x15, x14, #384*2 // t0 = t1 + 384*2
bl wiener_filter7_hv_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v3_7)
add x15, x15, #384*2*4 // t0 += 384*2*4
bl wiener_filter7_hv_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.ne L(main_7)
L(v3_7):
bl wiener_filter7_v_16bpc_neon
@@ -189,11 +181,11 @@ endfunc
function wiener_filter7_h_16bpc_neon
- stp x3, x5, [sp, #-32]!
+ stp x3, x4, [sp, #-32]!
str x14, [sp, #16]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
- tst w8, #1 // LR_HAVE_LEFT
+ tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
@@ -227,13 +219,13 @@ function wiener_filter7_h_16bpc_neon
2:
ld1 {v4.8h}, [x3], #16
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
- cmp w5, #19
+ cmp w4, #19
b.ge 4f // If w >= 19, all used input pixels are valid
// 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
@@ -242,14 +234,14 @@ function wiener_filter7_h_16bpc_neon
// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
- sub w17, w5, #22
+ sub w17, w4, #22
// Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
- movrel x7, right_ext_mask, -6
+ movrel x6, right_ext_mask, -6
ldr h26, [x3, w17, sxtw #1]
- sub x7, x7, w5, uxtw #1
+ sub x6, x6, w4, uxtw #1
dup v26.8h, v26.h[0]
- ld1 {v23.16b, v24.16b, v25.16b}, [x7]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
bit v2.16b, v26.16b, v23.16b
bit v3.16b, v26.16b, v24.16b
@@ -314,20 +306,20 @@ function wiener_filter7_h_16bpc_neon
sub v6.8h, v6.8h, v31.8h
sub v7.8h, v7.8h, v31.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v6.8h, v7.8h}, [x14], #32
b.le 0f
mov v2.16b, v4.16b
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
ld1 {v3.8h, v4.8h}, [x3], #32
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
0:
ldr x14, [sp, #16]
- ldp x3, x5, [sp], #32
+ ldp x3, x4, [sp], #32
ret
endfunc
@@ -337,7 +329,7 @@ function wiener_filter7_v_16bpc_neon
stp x10, x11, [sp, #-64]!
stp x12, x13, [sp, #16]
stp x14, x14, [sp, #32]
- stp x0, x5, [sp, #48]
+ stp x0, x4, [sp, #48]
1:
ld1 {v16.8h, v17.8h}, [x9], #32
ld1 {v18.8h, v19.8h}, [x10], #32
@@ -384,11 +376,11 @@ function wiener_filter7_v_16bpc_neon
sqxtun2 v3.8h, v5.4s
umin v2.8h, v2.8h, v28.8h // bitdepth_max
umin v3.8h, v3.8h, v28.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v2.8h, v3.8h}, [x0], #32
b.gt 1b
- ldp x0, x5, [sp, #48]
+ ldp x0, x4, [sp, #48]
ldp x13, x14, [sp, #32]
ldp x11, x12, [sp, #16]
ldp x9, x10, [sp], #64
@@ -404,10 +396,10 @@ function wiener_filter7_hv_16bpc_neon
stp x12, x13, [sp, #16]
stp x14, x15, [sp, #32]
stp x10, x0, [sp, #48]
- stp x3, x5, [sp, #64]
+ stp x3, x4, [sp, #64]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
- tst w8, #1 // LR_HAVE_LEFT
+ tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
@@ -440,13 +432,13 @@ function wiener_filter7_hv_16bpc_neon
2:
ld1 {v4.8h}, [x3], #16
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
- cmp w5, #19
+ cmp w4, #19
b.ge 4f // If w >= 19, all used input pixels are valid
// 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
@@ -455,14 +447,14 @@ function wiener_filter7_hv_16bpc_neon
// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
- sub w17, w5, #22
+ sub w17, w4, #22
// Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
- movrel x7, right_ext_mask, -6
+ movrel x6, right_ext_mask, -6
ldr h26, [x3, w17, sxtw #1]
- sub x7, x7, w5, uxtw #1
+ sub x6, x6, w4, uxtw #1
dup v26.8h, v26.h[0]
- ld1 {v23.16b, v24.16b, v25.16b}, [x7]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
bit v2.16b, v26.16b, v23.16b
bit v3.16b, v26.16b, v24.16b
@@ -571,19 +563,19 @@ function wiener_filter7_hv_16bpc_neon
st1 {v6.8h, v7.8h}, [x15], #32
umin v18.8h, v18.8h, v28.8h // bitdepth_max
umin v19.8h, v19.8h, v28.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v18.8h, v19.8h}, [x0], #32
b.le 0f
mov v2.16b, v4.16b
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
ld1 {v3.8h, v4.8h}, [x3], #32
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
0:
- ldp x3, x5, [sp, #64]
+ ldp x3, x4, [sp, #64]
ldp x15, x0, [sp, #48]
ldp x13, x14, [sp, #32]
ldp x11, x12, [sp, #16]
@@ -596,36 +588,30 @@ function wiener_filter7_hv_16bpc_neon
endfunc
// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
-// const pixel (*left)[4],
-// const pixel *lpf, const ptrdiff_t lpf_stride,
+// const pixel (*left)[4], const pixel *lpf,
// const int w, int h,
// const int16_t filter[2][8],
// const enum LrEdgeFlags edges,
// const int bitdepth_max);
function wiener_filter5_16bpc_neon, export=1
ldr w8, [sp]
-#ifdef __APPLE__
- ldr w9, [sp, #4]
-#else
- ldr w9, [sp, #8]
-#endif
stp x29, x30, [sp, #-32]!
stp d8, d9, [sp, #16]
mov x29, sp
- ld1 {v0.8h, v1.8h}, [x7]
- tst w8, #4 // LR_HAVE_TOP
+ ld1 {v0.8h, v1.8h}, [x6]
+ tst w7, #4 // LR_HAVE_TOP
sub_sp 384*2*4
- dup v28.8h, w9 // bitdepth_max
- clz w9, w9
+ dup v28.8h, w8 // bitdepth_max
+ clz w8, w8
movi v30.4s, #1
- sub w10, w9, #38 // -(bitdepth + 6)
- sub w11, w9, #11 // round_bits_v
- sub w9, w9, #25 // -round_bits_h
+ sub w10, w8, #38 // -(bitdepth + 6)
+ sub w11, w8, #11 // round_bits_v
+ sub w8, w8, #25 // -round_bits_h
neg w10, w10 // bitdepth + 6
neg w11, w11 // -round_bits_v
dup v2.4s, w10
- dup v29.4s, w9 // -round_bits_h
+ dup v29.4s, w8 // -round_bits_h
dup v27.4s, w11 // -round_bits_v
movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192
ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6)
@@ -643,42 +629,40 @@ function wiener_filter5_16bpc_neon, export=1
mov x16, x2 // backup left
mov x2, #0
bl wiener_filter5_h_16bpc_neon
- add x3, x3, x4 // lpf += lpf_stride
+ add x3, x3, x1 // lpf += stride
mov x11, x14 // t4
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter5_h_16bpc_neon
- add x3, x3, x4, lsl #2
- add x3, x3, x4 // lpf += lpf_stride*5
+ add x3, x3, x1, lsl #2
+ add x3, x3, x1 // lpf += stride*5
mov x12, x14 // t3
add x14, x14, #384*2 // t1 += 384*2
mov x2, x16 // left
mov x16, x3 // backup lpf
mov x3, x0 // lpf = p
bl wiener_filter5_h_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
mov x13, x14 // t2
b.eq L(v1_5)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter5_h_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v2_5)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
L(main_5):
mov x15, x11 // t0 = t4
L(main_loop_5):
bl wiener_filter5_hv_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.ne L(main_loop_5)
- tst w8, #8 // LR_HAVE_BOTTOM
+ tst w7, #8 // LR_HAVE_BOTTOM
b.eq L(v2_5)
mov x3, x16 // restore lpf
mov x2, #0 // left = NULL
- sub x4, x4, x1 // lpf_stride - p_stride
bl wiener_filter5_hv_16bpc_neon
- add x3, x3, x4 // src += lpf_stride - p_stride
bl wiener_filter5_hv_16bpc_neon
L(end_5):
@@ -688,29 +672,29 @@ L(end_5):
ret
L(no_top_5):
- add x3, x3, x4, lsl #2
- add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup
+ add x3, x3, x1, lsl #2
+ add x16, x3, x1, lsl #1 // lpf += stride*6, backup
mov x3, x0 // lpf = p
bl wiener_filter5_h_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
mov x11, x14 // t4
mov x12, x14 // t3
mov x13, x14 // t2
b.eq L(v1_5)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x14, x14, #384*2 // t1 += 384*2
bl wiener_filter5_h_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v2_5)
- add x3, x3, x1 // src += p_stride
+ add x3, x3, x1 // src += stride
add x15, x14, #384*2 // t0 = t1 + 384*2
bl wiener_filter5_hv_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.eq L(v2_5)
add x15, x15, #384*2*3 // t0 += 384*2*3
bl wiener_filter5_hv_16bpc_neon
- subs w6, w6, #1 // h--
+ subs w5, w5, #1 // h--
b.ne L(main_5)
L(v2_5):
bl wiener_filter5_v_16bpc_neon
@@ -725,11 +709,11 @@ endfunc
function wiener_filter5_h_16bpc_neon
- stp x3, x5, [sp, #-32]!
+ stp x3, x4, [sp, #-32]!
str x14, [sp, #16]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
- tst w8, #1 // LR_HAVE_LEFT
+ tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
@@ -763,13 +747,13 @@ function wiener_filter5_h_16bpc_neon
2:
ld1 {v4.8h}, [x3], #16
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
- cmp w5, #18
+ cmp w4, #18
b.ge 4f // If w >= 18, all used input pixels are valid
// 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
@@ -778,14 +762,14 @@ function wiener_filter5_h_16bpc_neon
// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
- sub w17, w5, #23
+ sub w17, w4, #23
// Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
// buffer pointer.
- movrel x7, right_ext_mask, -4
+ movrel x6, right_ext_mask, -4
ldr h26, [x3, w17, sxtw #1]
- sub x7, x7, w5, uxtw #1
+ sub x6, x6, w4, uxtw #1
dup v26.8h, v26.h[0]
- ld1 {v23.16b, v24.16b, v25.16b}, [x7]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
bit v2.16b, v26.16b, v23.16b
bit v3.16b, v26.16b, v24.16b
@@ -839,27 +823,27 @@ function wiener_filter5_h_16bpc_neon
sub v6.8h, v6.8h, v31.8h
sub v7.8h, v7.8h, v31.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v6.8h, v7.8h}, [x14], #32
b.le 0f
mov v2.16b, v4.16b
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
ld1 {v3.8h, v4.8h}, [x3], #32
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
0:
ldr x14, [sp, #16]
- ldp x3, x5, [sp], #32
+ ldp x3, x4, [sp], #32
ret
endfunc
function wiener_filter5_v_16bpc_neon
stp x11, x12, [sp, #-48]!
stp x13, x14, [sp, #16]
- stp x0, x5, [sp, #32]
+ stp x0, x4, [sp, #32]
1:
ld1 {v16.8h, v17.8h}, [x11], #32
ld1 {v18.8h, v19.8h}, [x12], #32
@@ -897,11 +881,11 @@ function wiener_filter5_v_16bpc_neon
umin v2.8h, v2.8h, v28.8h // bitdepth_max
umin v3.8h, v3.8h, v28.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v2.8h, v3.8h}, [x0], #32
b.gt 1b
- ldp x0, x5, [sp, #32]
+ ldp x0, x4, [sp, #32]
ldp x13, x14, [sp, #16]
ldp x11, x12, [sp], #48
@@ -914,10 +898,10 @@ function wiener_filter5_hv_16bpc_neon
stp x12, x13, [sp, #-64]!
stp x14, x15, [sp, #16]
stp x12, x0, [sp, #32]
- stp x3, x5, [sp, #48]
+ stp x3, x4, [sp, #48]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
- tst w8, #1 // LR_HAVE_LEFT
+ tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
@@ -950,13 +934,13 @@ function wiener_filter5_hv_16bpc_neon
2:
ld1 {v4.8h}, [x3], #16
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
- cmp w5, #18
+ cmp w4, #18
b.ge 4f // If w >= 18, all used input pixels are valid
// 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
@@ -965,14 +949,14 @@ function wiener_filter5_hv_16bpc_neon
// The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
- sub w17, w5, #23
+ sub w17, w4, #23
// Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
// buffer pointer.
- movrel x7, right_ext_mask, -4
+ movrel x6, right_ext_mask, -4
ldr h26, [x3, w17, sxtw #1]
- sub x7, x7, w5, uxtw #1
+ sub x6, x6, w4, uxtw #1
dup v26.8h, v26.h[0]
- ld1 {v23.16b, v24.16b, v25.16b}, [x7]
+ ld1 {v23.16b, v24.16b, v25.16b}, [x6]
bit v2.16b, v26.16b, v23.16b
bit v3.16b, v26.16b, v24.16b
@@ -1059,19 +1043,19 @@ function wiener_filter5_hv_16bpc_neon
umin v8.8h, v8.8h, v28.8h // bitdepth_max
umin v9.8h, v9.8h, v28.8h
- subs w5, w5, #16
+ subs w4, w4, #16
st1 {v8.8h, v9.8h}, [x0], #32
b.le 0f
mov v2.16b, v4.16b
- tst w8, #2 // LR_HAVE_RIGHT
+ tst w7, #2 // LR_HAVE_RIGHT
ld1 {v3.8h, v4.8h}, [x3], #32
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
0:
- ldp x3, x5, [sp, #48]
+ ldp x3, x4, [sp, #48]
ldp x15, x0, [sp, #32]
ldp x13, x14, [sp, #16]
ldp x11, x12, [sp], #64
diff --git a/src/arm/looprestoration_init_tmpl.c b/src/arm/looprestoration_init_tmpl.c
index 61584e6..5ba4bce 100644
--- a/src/arm/looprestoration_init_tmpl.c
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -29,16 +29,14 @@
#include "src/looprestoration.h"
#if ARCH_AARCH64
-void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t p_stride,
- const pixel (*left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
+void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX);
-void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t p_stride,
- const pixel (*left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
+void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
+ const pixel (*left)[4], const pixel *lpf,
const int w, int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges
@@ -76,9 +74,8 @@ void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
const int16_t fv[8], enum LrEdgeFlags edges,
ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
-static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
- const pixel (*const left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
@@ -88,20 +85,20 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
int mid_stride = (w + 7) & ~7;
// Horizontal filter
- BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride,
+ BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride,
filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
if (edges & LR_HAVE_TOP)
- BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride,
+ BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
filter[0], w, 2, edges
HIGHBD_TAIL_SUFFIX);
if (edges & LR_HAVE_BOTTOM)
BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
- lpf + 6 * PXSTRIDE(lpf_stride),
- lpf_stride, filter[0], w, 2, edges
+ lpf + 6 * PXSTRIDE(stride),
+ stride, filter[0], w, 2, edges
HIGHBD_TAIL_SUFFIX);
// Vertical filter
- BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
+ BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride],
w, h, filter[1], edges,
mid_stride * sizeof(*mid)
HIGHBD_TAIL_SUFFIX);
@@ -127,8 +124,7 @@ void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
/* filter with a 3x3 box (radius=1) */
static void dav1d_sgr_filter1_neon(int16_t *tmp,
const pixel *src, const ptrdiff_t stride,
- const pixel (*left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
+ const pixel (*left)[4], const pixel *lpf,
const int w, const int h, const int strength,
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
@@ -141,12 +137,12 @@ static void dav1d_sgr_filter1_neon(int16_t *tmp,
BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
- NULL, lpf, lpf_stride, w, 2, edges);
+ NULL, lpf, stride, w, 2, edges);
if (edges & LR_HAVE_BOTTOM)
BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
- NULL, lpf + 6 * PXSTRIDE(lpf_stride),
- lpf_stride, w, 2, edges);
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
@@ -172,8 +168,7 @@ void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
/* filter with a 5x5 box (radius=2) */
static void dav1d_sgr_filter2_neon(int16_t *tmp,
const pixel *src, const ptrdiff_t stride,
- const pixel (*left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
+ const pixel (*left)[4], const pixel *lpf,
const int w, const int h, const int strength,
const enum LrEdgeFlags edges
HIGHBD_DECL_SUFFIX)
@@ -186,12 +181,12 @@ static void dav1d_sgr_filter2_neon(int16_t *tmp,
BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
- NULL, lpf, lpf_stride, w, 2, edges);
+ NULL, lpf, stride, w, 2, edges);
if (edges & LR_HAVE_BOTTOM)
BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
- NULL, lpf + 6 * PXSTRIDE(lpf_stride),
- lpf_stride, w, 2, edges);
+ NULL, lpf + 6 * PXSTRIDE(stride),
+ stride, w, 2, edges);
dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
@@ -208,49 +203,46 @@ void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
const int w, const int h,
const int16_t wt[2] HIGHBD_DECL_SUFFIX);
-static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t dst_stride,
- const pixel (*const left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
+static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
- dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
+ dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf,
w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
- BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
}
-static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t dst_stride,
- const pixel (*const left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
+static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int16_t, tmp, 64 * 384,);
- dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
+ dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf,
w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
- BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
+ BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
}
-static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t dst_stride,
- const pixel (*const left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
+static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
+ const pixel (*const left)[4], const pixel *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
- dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
+ dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf,
w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
- dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
+ dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf,
w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
- BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
+ BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride,
tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
}
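
For reference, the net effect at a call site is one stride argument
instead of two; a minimal sketch, with illustrative names not taken from
this patch:

// before: filter(p, p_stride, left, lpf, lpf_stride, w, h, &params, edges);
// after:  filter(p, stride,   left, lpf,             w, h, &params, edges);
//
// The lpf rows must be laid out with the picture stride, since the
// filters now step through them with `stride` (see the `lpf += stride`
// hunks above).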