diff options
author | Victorien Le Couviour--Tuffet <victorien@videolan.org> | 2021-10-11 15:57:11 +0300 |
---|---|---|
committer | Victorien Le Couviour--Tuffet <victorien@videolan.org> | 2021-10-29 23:18:20 +0300 |
commit | f7e0d4c032dad31a46129c9eaf30ad5c2f704d2b (patch) | |
tree | edcea1c0cc1e534ff17fa31ba9c9c40154dbdfbf /src/arm | |
parent | 609fbaba84b8e15a25de9efbc8b14988be6df94d (diff) |
Remove lpf_stride parameter from LR filters
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/64/looprestoration.S | 200 | ||||
-rw-r--r-- | src/arm/64/looprestoration16.S | 222 | ||||
-rw-r--r-- | src/arm/looprestoration_init_tmpl.c | 72 |
3 files changed, 231 insertions(+), 263 deletions(-)
diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S index 35e71b8..778448a 100644 --- a/src/arm/64/looprestoration.S +++ b/src/arm/64/looprestoration.S @@ -44,18 +44,16 @@ right_ext_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst -// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t p_stride, -// const pixel (*left)[4], -// const pixel *lpf, const ptrdiff_t lpf_stride, +// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride, +// const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges); function wiener_filter7_8bpc_neon, export=1 - ldr w8, [sp] stp x29, x30, [sp, #-16]! mov x29, sp - ld1 {v0.8h, v1.8h}, [x7] - tst w8, #4 // LR_HAVE_TOP + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*6 mov w17, #(1 << 14) - (1 << 2) @@ -75,50 +73,48 @@ function wiener_filter7_8bpc_neon, export=1 mov x16, x2 // backup left mov x2, #0 bl wiener_filter7_h_8bpc_neon - add x3, x3, x4 // lpf += lpf_stride + add x3, x3, x1 // lpf += stride mov x9, x14 // t6 mov x10, x14 // t5 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon - add x3, x3, x4, lsl #2 - add x3, x3, x4 // lpf += lpf_stride*5 + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter7_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon mov x13, x14 // t2 - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) - add x3, 
x3, x1 // src += p_stride + add x3, x3, x1 // src += stride L(main_7): add x15, x14, #384*2 // t0 = t1 + 384*2 L(main_loop_7): bl wiener_filter7_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_loop_7) - tst w8, #8 // LR_HAVE_BOTTOM + tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v3_7) mov x3, x16 // restore lpf mov x2, #0 // left = NULL - sub x4, x4, x1 // lpf_stride - p_stride bl wiener_filter7_hv_8bpc_neon - add x3, x3, x4 // src += lpf_stride - p_stride bl wiener_filter7_hv_8bpc_neon L(v1_7): bl wiener_filter7_v_8bpc_neon @@ -128,37 +124,37 @@ L(v1_7): ret L(no_top_7): - add x3, x3, x4, lsl #2 - add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter7_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x9, x14 // t6 mov x10, x14 // t5 mov x11, x14 // t4 mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v2_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter7_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) add x15, x15, #384*2*4 // t0 += 384*2*4 bl wiener_filter7_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_7) L(v3_7): bl wiener_filter7_v_8bpc_neon @@ -169,11 +165,11 @@ endfunc function wiener_filter7_h_8bpc_neon - stp x3, x5, [sp, #-32]! + stp x3, x4, [sp, #-32]! 
str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -208,13 +204,13 @@ function wiener_filter7_h_8bpc_neon uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #19 + cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, @@ -223,14 +219,14 @@ function wiener_filter7_h_8bpc_neon // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. - sub w17, w5, #22 + sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. - movrel x7, right_ext_mask, -6 + movrel x6, right_ext_mask, -6 ldr b28, [x3, w17, sxtw] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] - ld1 {v25.16b, v26.16b, v27.16b}, [x7] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b @@ -280,14 +276,14 @@ function wiener_filter7_h_8bpc_neon add v6.8h, v6.8h, v31.8h add v7.8h, v7.8h, v31.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. @@ -295,7 +291,7 @@ function wiener_filter7_h_8bpc_neon 0: ldr x14, [sp, #16] - ldp x3, x5, [sp], #32 + ldp x3, x4, [sp], #32 ret endfunc @@ -305,7 +301,7 @@ function wiener_filter7_v_8bpc_neon stp x10, x11, [sp, #-64]! 
stp x12, x13, [sp, #16] stp x14, x14, [sp, #32] - stp x0, x5, [sp, #48] + stp x0, x4, [sp, #48] 1: ld1 {v20.8h, v21.8h}, [x11], #32 ld1 {v24.8h, v25.8h}, [x13], #32 @@ -345,11 +341,11 @@ function wiener_filter7_v_8bpc_neon sqrshrun2 v3.8h, v5.4s, #11 sqxtun v2.8b, v2.8h sqxtun2 v2.16b, v3.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v2.16b}, [x0], #16 b.gt 1b - ldp x0, x5, [sp, #48] + ldp x0, x4, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] ldp x9, x10, [sp], #64 @@ -365,10 +361,10 @@ function wiener_filter7_hv_8bpc_neon stp x12, x13, [sp, #16] stp x14, x15, [sp, #32] stp x10, x0, [sp, #48] - stp x3, x5, [sp, #64] + stp x3, x4, [sp, #64] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -402,13 +398,13 @@ function wiener_filter7_hv_8bpc_neon uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #19 + cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, @@ -417,14 +413,14 @@ function wiener_filter7_hv_8bpc_neon // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. - sub w17, w5, #22 + sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. 
- movrel x7, right_ext_mask, -6 + movrel x6, right_ext_mask, -6 ldr b28, [x3, w17, sxtw] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] - ld1 {v25.16b, v26.16b, v27.16b}, [x7] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b @@ -510,21 +506,21 @@ function wiener_filter7_hv_8bpc_neon st1 {v6.8h, v7.8h}, [x15], #32 sqxtun v18.8b, v18.8h sqxtun2 v18.16b, v19.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v18.16b}, [x0], #16 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: - ldp x3, x5, [sp, #64] + ldp x3, x4, [sp, #64] ldp x15, x0, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] @@ -536,18 +532,16 @@ function wiener_filter7_hv_8bpc_neon ret endfunc -// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t p_stride, -// const pixel (*left)[4], -// const pixel *lpf, const ptrdiff_t lpf_stride, +// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride, +// const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges); function wiener_filter5_8bpc_neon, export=1 - ldr w8, [sp] stp x29, x30, [sp, #-16]! 
mov x29, sp - ld1 {v0.8h, v1.8h}, [x7] - tst w8, #4 // LR_HAVE_TOP + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*4 mov w17, #(1 << 14) - (1 << 2) @@ -565,42 +559,40 @@ function wiener_filter5_8bpc_neon, export=1 mov x16, x2 // backup left mov x2, #0 bl wiener_filter5_h_8bpc_neon - add x3, x3, x4 // lpf += lpf_stride + add x3, x3, x1 // lpf += stride mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_8bpc_neon - add x3, x3, x4, lsl #2 - add x3, x3, x4 // lpf += lpf_stride*5 + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 mov x12, x14 // t3 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter5_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v1_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride L(main_5): mov x15, x11 // t0 = t4 L(main_loop_5): bl wiener_filter5_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_loop_5) - tst w8, #8 // LR_HAVE_BOTTOM + tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v2_5) mov x3, x16 // restore lpf mov x2, #0 // left = NULL - sub x4, x4, x1 // lpf_stride - p_stride bl wiener_filter5_hv_8bpc_neon - add x3, x3, x4 // src += lpf_stride - p_stride bl wiener_filter5_hv_8bpc_neon L(end_5): @@ -609,29 +601,29 @@ L(end_5): ret L(no_top_5): - add x3, x3, x4, lsl #2 - add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter5_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x11, x14 // t4 mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, 
#384*2 // t1 += 384*2 bl wiener_filter5_h_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter5_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) add x15, x15, #384*2*3 // t0 += 384*2*3 bl wiener_filter5_hv_8bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_5) L(v2_5): bl wiener_filter5_v_8bpc_neon @@ -646,11 +638,11 @@ endfunc function wiener_filter5_h_8bpc_neon - stp x3, x5, [sp, #-32]! + stp x3, x4, [sp, #-32]! str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -685,13 +677,13 @@ function wiener_filter5_h_8bpc_neon uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #18 + cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, @@ -700,14 +692,14 @@ function wiener_filter5_h_8bpc_neon // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. - sub w17, w5, #23 + sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. 
- movrel x7, right_ext_mask, -4 + movrel x6, right_ext_mask, -4 ldr b28, [x3, w17, sxtw] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] - ld1 {v25.16b, v26.16b, v27.16b}, [x7] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b @@ -748,14 +740,14 @@ function wiener_filter5_h_8bpc_neon add v6.8h, v6.8h, v31.8h add v7.8h, v7.8h, v31.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. @@ -763,14 +755,14 @@ function wiener_filter5_h_8bpc_neon 0: ldr x14, [sp, #16] - ldp x3, x5, [sp], #32 + ldp x3, x4, [sp], #32 ret endfunc function wiener_filter5_v_8bpc_neon stp x11, x12, [sp, #-48]! stp x13, x14, [sp, #16] - stp x0, x5, [sp, #32] + stp x0, x4, [sp, #32] 1: ld1 {v18.8h, v19.8h}, [x12], #32 ld1 {v22.8h, v23.8h}, [x14], #32 @@ -800,11 +792,11 @@ function wiener_filter5_v_8bpc_neon sqrshrun2 v3.8h, v5.4s, #11 sqxtun v2.8b, v2.8h sqxtun2 v2.16b, v3.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v2.16b}, [x0], #16 b.gt 1b - ldp x0, x5, [sp, #32] + ldp x0, x4, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #48 @@ -817,10 +809,10 @@ function wiener_filter5_hv_8bpc_neon stp x12, x13, [sp, #-64]! 
stp x14, x15, [sp, #16] stp x12, x0, [sp, #32] - stp x3, x5, [sp, #48] + stp x3, x4, [sp, #48] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -854,13 +846,13 @@ function wiener_filter5_hv_8bpc_neon uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #18 + cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, @@ -869,14 +861,14 @@ function wiener_filter5_hv_8bpc_neon // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. - sub w17, w5, #23 + sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. - movrel x7, right_ext_mask, -4 + movrel x6, right_ext_mask, -4 ldr b28, [x3, w17, sxtw] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] - ld1 {v25.16b, v26.16b, v27.16b}, [x7] + ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b @@ -944,21 +936,21 @@ function wiener_filter5_hv_8bpc_neon st1 {v6.8h, v7.8h}, [x15], #32 sqxtun v18.8b, v18.8h sqxtun2 v18.16b, v19.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v18.16b}, [x0], #16 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 
0: - ldp x3, x5, [sp, #48] + ldp x3, x4, [sp, #48] ldp x15, x0, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #64 diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S index 69efa4e..fcb4f84 100644 --- a/src/arm/64/looprestoration16.S +++ b/src/arm/64/looprestoration16.S @@ -45,36 +45,30 @@ right_ext_mask: endconst // void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride, -// const pixel (*left)[4], -// const pixel *lpf, const ptrdiff_t lpf_stride, +// const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges, // const int bitdepth_max); function wiener_filter7_16bpc_neon, export=1 ldr w8, [sp] -#ifdef __APPLE__ - ldr w9, [sp, #4] -#else - ldr w9, [sp, #8] -#endif stp x29, x30, [sp, #-32]! stp d8, d9, [sp, #16] mov x29, sp - ld1 {v0.8h, v1.8h}, [x7] - tst w8, #4 // LR_HAVE_TOP + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*6 - dup v28.8h, w9 // bitdepth_max - clz w9, w9 + dup v28.8h, w8 // bitdepth_max + clz w8, w8 movi v30.4s, #1 - sub w10, w9, #38 // -(bitdepth + 6) - sub w11, w9, #11 // round_bits_v - sub w9, w9, #25 // -round_bits_h + sub w10, w8, #38 // -(bitdepth + 6) + sub w11, w8, #11 // round_bits_v + sub w8, w8, #25 // -round_bits_h neg w10, w10 // bitdepth + 6 neg w11, w11 // -round_bits_v dup v2.4s, w10 - dup v29.4s, w9 // -round_bits_h + dup v29.4s, w8 // -round_bits_h dup v27.4s, w11 // -round_bits_v movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) @@ -94,50 +88,48 @@ function wiener_filter7_16bpc_neon, export=1 mov x16, x2 // backup left mov x2, #0 bl wiener_filter7_h_16bpc_neon - add x3, x3, x4 // lpf += lpf_stride + add x3, x3, x1 // lpf += stride mov x9, x14 // t6 mov x10, x14 // t5 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon - add x3, x3, x4, lsl #2 - add x3, x3, x4 // lpf += lpf_stride*5 + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += 
stride*5 mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter7_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon mov x13, x14 // t2 - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride L(main_7): add x15, x14, #384*2 // t0 = t1 + 384*2 L(main_loop_7): bl wiener_filter7_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_loop_7) - tst w8, #8 // LR_HAVE_BOTTOM + tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v3_7) mov x3, x16 // restore lpf mov x2, #0 // left = NULL - sub x4, x4, x1 // lpf_stride - p_stride bl wiener_filter7_hv_16bpc_neon - add x3, x3, x4 // src += lpf_stride - p_stride bl wiener_filter7_hv_16bpc_neon L(v1_7): bl wiener_filter7_v_16bpc_neon @@ -148,12 +140,12 @@ L(v1_7): ret L(no_top_7): - add x3, x3, x4, lsl #2 - add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter7_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x9, x14 // t6 mov x10, x14 // t5 mov x11, x14 // t4 @@ -163,22 +155,22 @@ L(no_top_7): add x3, x3, x1 // src += p_stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v2_7) add x3, x3, x1 // src += p_stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) add 
x3, x3, x1 // src += p_stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter7_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v3_7) add x15, x15, #384*2*4 // t0 += 384*2*4 bl wiener_filter7_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_7) L(v3_7): bl wiener_filter7_v_16bpc_neon @@ -189,11 +181,11 @@ endfunc function wiener_filter7_h_16bpc_neon - stp x3, x5, [sp, #-32]! + stp x3, x4, [sp, #-32]! str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -227,13 +219,13 @@ function wiener_filter7_h_16bpc_neon 2: ld1 {v4.8h}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #19 + cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, @@ -242,14 +234,14 @@ function wiener_filter7_h_16bpc_neon // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. - sub w17, w5, #22 + sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. - movrel x7, right_ext_mask, -6 + movrel x6, right_ext_mask, -6 ldr h26, [x3, w17, sxtw #1] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] - ld1 {v23.16b, v24.16b, v25.16b}, [x7] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b @@ -314,20 +306,20 @@ function wiener_filter7_h_16bpc_neon sub v6.8h, v6.8h, v31.8h sub v7.8h, v7.8h, v31.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. 
b 3b // If we need to pad, check how many pixels we have left. 0: ldr x14, [sp, #16] - ldp x3, x5, [sp], #32 + ldp x3, x4, [sp], #32 ret endfunc @@ -337,7 +329,7 @@ function wiener_filter7_v_16bpc_neon stp x10, x11, [sp, #-64]! stp x12, x13, [sp, #16] stp x14, x14, [sp, #32] - stp x0, x5, [sp, #48] + stp x0, x4, [sp, #48] 1: ld1 {v16.8h, v17.8h}, [x9], #32 ld1 {v18.8h, v19.8h}, [x10], #32 @@ -384,11 +376,11 @@ function wiener_filter7_v_16bpc_neon sqxtun2 v3.8h, v5.4s umin v2.8h, v2.8h, v28.8h // bitdepth_max umin v3.8h, v3.8h, v28.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v2.8h, v3.8h}, [x0], #32 b.gt 1b - ldp x0, x5, [sp, #48] + ldp x0, x4, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] ldp x9, x10, [sp], #64 @@ -404,10 +396,10 @@ function wiener_filter7_hv_16bpc_neon stp x12, x13, [sp, #16] stp x14, x15, [sp, #32] stp x10, x0, [sp, #48] - stp x3, x5, [sp, #64] + stp x3, x4, [sp, #64] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -440,13 +432,13 @@ function wiener_filter7_hv_16bpc_neon 2: ld1 {v4.8h}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #19 + cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, @@ -455,14 +447,14 @@ function wiener_filter7_hv_16bpc_neon // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. - sub w17, w5, #22 + sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. 
- movrel x7, right_ext_mask, -6 + movrel x6, right_ext_mask, -6 ldr h26, [x3, w17, sxtw #1] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] - ld1 {v23.16b, v24.16b, v25.16b}, [x7] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b @@ -571,19 +563,19 @@ function wiener_filter7_hv_16bpc_neon st1 {v6.8h, v7.8h}, [x15], #32 umin v18.8h, v18.8h, v28.8h // bitdepth_max umin v19.8h, v19.8h, v28.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v18.8h, v19.8h}, [x0], #32 b.le 0f mov v2.16b, v4.16b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: - ldp x3, x5, [sp, #64] + ldp x3, x4, [sp, #64] ldp x15, x0, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] @@ -596,36 +588,30 @@ function wiener_filter7_hv_16bpc_neon endfunc // void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride, -// const pixel (*left)[4], -// const pixel *lpf, const ptrdiff_t lpf_stride, +// const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges, // const int bitdepth_max); function wiener_filter5_16bpc_neon, export=1 ldr w8, [sp] -#ifdef __APPLE__ - ldr w9, [sp, #4] -#else - ldr w9, [sp, #8] -#endif stp x29, x30, [sp, #-32]! 
stp d8, d9, [sp, #16] mov x29, sp - ld1 {v0.8h, v1.8h}, [x7] - tst w8, #4 // LR_HAVE_TOP + ld1 {v0.8h, v1.8h}, [x6] + tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*4 - dup v28.8h, w9 // bitdepth_max - clz w9, w9 + dup v28.8h, w8 // bitdepth_max + clz w8, w8 movi v30.4s, #1 - sub w10, w9, #38 // -(bitdepth + 6) - sub w11, w9, #11 // round_bits_v - sub w9, w9, #25 // -round_bits_h + sub w10, w8, #38 // -(bitdepth + 6) + sub w11, w8, #11 // round_bits_v + sub w8, w8, #25 // -round_bits_h neg w10, w10 // bitdepth + 6 neg w11, w11 // -round_bits_v dup v2.4s, w10 - dup v29.4s, w9 // -round_bits_h + dup v29.4s, w8 // -round_bits_h dup v27.4s, w11 // -round_bits_v movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) @@ -643,42 +629,40 @@ function wiener_filter5_16bpc_neon, export=1 mov x16, x2 // backup left mov x2, #0 bl wiener_filter5_h_16bpc_neon - add x3, x3, x4 // lpf += lpf_stride + add x3, x3, x1 // lpf += stride mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_16bpc_neon - add x3, x3, x4, lsl #2 - add x3, x3, x4 // lpf += lpf_stride*5 + add x3, x3, x1, lsl #2 + add x3, x3, x1 // lpf += stride*5 mov x12, x14 // t3 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter5_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v1_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride L(main_5): mov x15, x11 // t0 = t4 L(main_loop_5): bl wiener_filter5_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_loop_5) - tst w8, #8 // LR_HAVE_BOTTOM + tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v2_5) mov x3, x16 // restore lpf mov x2, #0 // left = NULL - sub x4, x4, x1 // lpf_stride - 
p_stride bl wiener_filter5_hv_16bpc_neon - add x3, x3, x4 // src += lpf_stride - p_stride bl wiener_filter5_hv_16bpc_neon L(end_5): @@ -688,29 +672,29 @@ L(end_5): ret L(no_top_5): - add x3, x3, x4, lsl #2 - add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + add x3, x3, x1, lsl #2 + add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter5_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- mov x11, x14 // t4 mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) - add x3, x3, x1 // src += p_stride + add x3, x3, x1 // src += stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter5_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.eq L(v2_5) add x15, x15, #384*2*3 // t0 += 384*2*3 bl wiener_filter5_hv_16bpc_neon - subs w6, w6, #1 // h-- + subs w5, w5, #1 // h-- b.ne L(main_5) L(v2_5): bl wiener_filter5_v_16bpc_neon @@ -725,11 +709,11 @@ endfunc function wiener_filter5_h_16bpc_neon - stp x3, x5, [sp, #-32]! + stp x3, x4, [sp, #-32]! str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -763,13 +747,13 @@ function wiener_filter5_h_16bpc_neon 2: ld1 {v4.8h}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #18 + cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, @@ -778,14 +762,14 @@ function wiener_filter5_h_16bpc_neon // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. 
- sub w17, w5, #23 + sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. - movrel x7, right_ext_mask, -4 + movrel x6, right_ext_mask, -4 ldr h26, [x3, w17, sxtw #1] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] - ld1 {v23.16b, v24.16b, v25.16b}, [x7] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b @@ -839,27 +823,27 @@ function wiener_filter5_h_16bpc_neon sub v6.8h, v6.8h, v31.8h sub v7.8h, v7.8h, v31.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: ldr x14, [sp, #16] - ldp x3, x5, [sp], #32 + ldp x3, x4, [sp], #32 ret endfunc function wiener_filter5_v_16bpc_neon stp x11, x12, [sp, #-48]! stp x13, x14, [sp, #16] - stp x0, x5, [sp, #32] + stp x0, x4, [sp, #32] 1: ld1 {v16.8h, v17.8h}, [x11], #32 ld1 {v18.8h, v19.8h}, [x12], #32 @@ -897,11 +881,11 @@ function wiener_filter5_v_16bpc_neon umin v2.8h, v2.8h, v28.8h // bitdepth_max umin v3.8h, v3.8h, v28.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v2.8h, v3.8h}, [x0], #32 b.gt 1b - ldp x0, x5, [sp, #32] + ldp x0, x4, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #48 @@ -914,10 +898,10 @@ function wiener_filter5_hv_16bpc_neon stp x12, x13, [sp, #-64]! 
stp x14, x15, [sp, #16] stp x12, x0, [sp, #32] - stp x3, x5, [sp, #48] + stp x3, x4, [sp, #48] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w8, #1 // LR_HAVE_LEFT + tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f @@ -950,13 +934,13 @@ function wiener_filter5_hv_16bpc_neon 2: ld1 {v4.8h}, [x3], #16 - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #18 + cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, @@ -965,14 +949,14 @@ function wiener_filter5_hv_16bpc_neon // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. - sub w17, w5, #23 + sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. - movrel x7, right_ext_mask, -4 + movrel x6, right_ext_mask, -4 ldr h26, [x3, w17, sxtw #1] - sub x7, x7, w5, uxtw #1 + sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] - ld1 {v23.16b, v24.16b, v25.16b}, [x7] + ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b @@ -1059,19 +1043,19 @@ function wiener_filter5_hv_16bpc_neon umin v8.8h, v8.8h, v28.8h // bitdepth_max umin v9.8h, v9.8h, v28.8h - subs w5, w5, #16 + subs w4, w4, #16 st1 {v8.8h, v9.8h}, [x0], #32 b.le 0f mov v2.16b, v4.16b - tst w8, #2 // LR_HAVE_RIGHT + tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 
0: - ldp x3, x5, [sp, #48] + ldp x3, x4, [sp, #48] ldp x15, x0, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #64 diff --git a/src/arm/looprestoration_init_tmpl.c b/src/arm/looprestoration_init_tmpl.c index 61584e6..5ba4bce 100644 --- a/src/arm/looprestoration_init_tmpl.c +++ b/src/arm/looprestoration_init_tmpl.c @@ -29,16 +29,14 @@ #include "src/looprestoration.h" #if ARCH_AARCH64 -void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t p_stride, - const pixel (*left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, const int w, int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); -void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t p_stride, - const pixel (*left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, const int w, int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges @@ -76,9 +74,8 @@ void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride, const int16_t fv[8], enum LrEdgeFlags edges, ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX); -static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride, - const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) @@ -88,20 +85,20 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride, int mid_stride = (w + 7) & ~7; // Horizontal filter - BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride, + BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride, 
filter[0], w, h, edges HIGHBD_TAIL_SUFFIX); if (edges & LR_HAVE_TOP) - BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride, + BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride, filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); if (edges & LR_HAVE_BOTTOM) BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL, - lpf + 6 * PXSTRIDE(lpf_stride), - lpf_stride, filter[0], w, 2, edges + lpf + 6 * PXSTRIDE(stride), + stride, filter[0], w, 2, edges HIGHBD_TAIL_SUFFIX); // Vertical filter - BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride], + BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride], w, h, filter[1], edges, mid_stride * sizeof(*mid) HIGHBD_TAIL_SUFFIX); @@ -127,8 +124,7 @@ void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp, /* filter with a 3x3 box (radius=1) */ static void dav1d_sgr_filter1_neon(int16_t *tmp, const pixel *src, const ptrdiff_t stride, - const pixel (*left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, + const pixel (*left)[4], const pixel *lpf, const int w, const int h, const int strength, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) @@ -141,12 +137,12 @@ static void dav1d_sgr_filter1_neon(int16_t *tmp, BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges); if (edges & LR_HAVE_TOP) BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], - NULL, lpf, lpf_stride, w, 2, edges); + NULL, lpf, stride, w, 2, edges); if (edges & LR_HAVE_BOTTOM) BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], - NULL, lpf + 6 * PXSTRIDE(lpf_stride), - lpf_stride, w, 2, edges); + NULL, lpf + 6 * PXSTRIDE(stride), + stride, w, 2, edges); dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges); dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX); @@ -172,8 +168,7 @@ void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp, /* filter with a 5x5 box (radius=2) */ static void dav1d_sgr_filter2_neon(int16_t *tmp, const pixel *src, const ptrdiff_t stride, - const pixel 
(*left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, + const pixel (*left)[4], const pixel *lpf, const int w, const int h, const int strength, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) @@ -186,12 +181,12 @@ static void dav1d_sgr_filter2_neon(int16_t *tmp, BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges); if (edges & LR_HAVE_TOP) BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], - NULL, lpf, lpf_stride, w, 2, edges); + NULL, lpf, stride, w, 2, edges); if (edges & LR_HAVE_BOTTOM) BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], - NULL, lpf + 6 * PXSTRIDE(lpf_stride), - lpf_stride, w, 2, edges); + NULL, lpf + 6 * PXSTRIDE(stride), + stride, w, 2, edges); dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges); dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX); @@ -208,49 +203,46 @@ void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride, const int w, const int h, const int16_t wt[2] HIGHBD_DECL_SUFFIX); -static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t dst_stride, - const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { ALIGN_STK_16(int16_t, tmp, 64 * 384,); - dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride, + dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf, w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX); - BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride, + BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX); } -static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t dst_stride, - const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +static void 
sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { ALIGN_STK_16(int16_t, tmp, 64 * 384,); - dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride, + dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf, w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX); - BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride, + BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride, tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX); } -static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t dst_stride, - const pixel (*const left)[4], - const pixel *lpf, const ptrdiff_t lpf_stride, +static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride, + const pixel (*const left)[4], const pixel *lpf, const int w, const int h, const LooprestorationParams *const params, const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { ALIGN_STK_16(int16_t, tmp1, 64 * 384,); ALIGN_STK_16(int16_t, tmp2, 64 * 384,); - dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride, + dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf, w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX); - dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride, + dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf, w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX); const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 }; - BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride, + BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride, tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX); } |