github.com/videolan/dav1d.git
author    Martin Storsjö <martin@martin.st>  2019-02-25 00:55:47 +0300
committer Jean-Baptiste Kempf <jb@videolan.org>  2019-04-16 13:31:49 +0300
commit    204bf2115ff91d1a594ab47c7d8adaf2b1b80b07 (patch)
tree      0c6acb39daec4b670ac4d2fb0ed79b41dd238be5 /src/arm/64/looprestoration.S
parent    003fa104596d3bf9ebde5874359ff5712fafb730 (diff)
arm64: looprestoration: Add a NEON implementation of SGR
Relative speedup vs (autovectorized) C code:

                            Cortex A53    A72    A73
selfguided_3x3_8bpc_neon:         2.91   2.12   2.68
selfguided_5x5_8bpc_neon:         3.18   2.65   3.39
selfguided_mix_8bpc_neon:         3.04   2.29   2.98

The relative speedup vs non-vectorized C code is around 2.6-4.6x.
Diffstat (limited to 'src/arm/64/looprestoration.S')
-rw-r--r--  src/arm/64/looprestoration.S  1372
1 file changed, 1372 insertions, 0 deletions
diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S
index 7fc34d9..3591f3b 100644
--- a/src/arm/64/looprestoration.S
+++ b/src/arm/64/looprestoration.S
@@ -26,6 +26,7 @@
*/
#include "src/arm/asm.S"
+#include "util.S"
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
@@ -613,3 +614,1374 @@ L(copy_narrow_tbl):
.hword L(copy_narrow_tbl) - 60b
.hword L(copy_narrow_tbl) - 70b
endfunc
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_h_neon, export=1
+ add w5, w5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add x10, x0, #(4*SUM_STRIDE) // sumsq
+ add x11, x1, #(2*SUM_STRIDE) // sum
+ add x12, x3, x4 // src
+ lsl x4, x4, #1
+ mov x9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 0f
+ // !LR_HAVE_RIGHT
+ add w13, w5, #3
+ bic w13, w13, #3
+ b 1f
+0:
+ add w13, w5, #7
+ bic w13, w13, #7
+1:
+ sub x9, x9, w13, uxtw #1
+
+ // Store the width for the vertical loop
+ mov w8, w5
+
+ // Subtract the number of pixels read from the input from the stride
+ add w13, w5, #14
+ bic w13, w13, #7
+ sub x4, x4, w13, uxtw
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #2
+ sub x12, x12, #2
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 2 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x4, x4, #2
+
+1: // Loop vertically
+ ld1 {v0.16b}, [x3], #16
+ ld1 {v4.16b}, [x12], #16
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x2, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v1.s}[3], [x2], #4
+ // Move x3/x12 back to account for the last 2 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #2
+ sub x12, x12, #2
+ ld1 {v5.s}[3], [x2], #4
+ ext v0.16b, v1.16b, v0.16b, #14
+ ext v4.16b, v5.16b, v4.16b, #14
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 2x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ dup v5.16b, v4.b[0]
+ // Move x3/x12 back to account for the last 2 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #2
+ sub x12, x12, #2
+ ext v0.16b, v1.16b, v0.16b, #14
+ ext v4.16b, v5.16b, v4.16b, #14
+
+2:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+ umull v5.8h, v4.8b, v4.8b
+ umull2 v6.8h, v4.16b, v4.16b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w5, #(2 + 16 - 2 + 1)
+ ldr b30, [x3, w13, sxtw]
+ ldr b31, [x12, w13, sxtw]
+ // Fill v30/v31 with the right padding pixel
+ dup v30.8b, v30.b[0]
+ dup v31.8b, v31.b[0]
+3: // !LR_HAVE_RIGHT
+ // If we'll have to pad the right edge we need to quit early here.
+ cmp w5, #10
+ b.ge 4f // If w >= 10, all used input pixels are valid
+ cmp w5, #6
+ b.ge 5f // If w >= 6, we can filter 4 pixels
+ b 6f
+
+4: // Loop horizontally
+.macro uaddl_nh dst1, dst2, src1, src2, w
+ uaddl \dst1, \src1\().4h, \src2\().4h
+.if \w > 4
+ uaddl2 \dst2, \src1\().8h, \src2\().8h
+.endif
+.endm
+.macro uaddw_nh dst1, dst2, src, w
+ uaddw \dst1, \dst1, \src\().4h
+.if \w > 4
+ uaddw2 \dst2, \dst2, \src\().8h
+.endif
+.endm
+.macro add_nh dst1, dst2, src1, src2, w
+ add \dst1, \dst1, \src1
+.if \w > 4
+ add \dst2, \dst2, \src2
+.endif
+.endm
+
+.macro add3 w
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ ext v18.16b, v4.16b, v4.16b, #1
+ ext v19.16b, v4.16b, v4.16b, #2
+ uaddl v3.8h, v0.8b, v16.8b
+ uaddw v3.8h, v3.8h, v17.8b
+ uaddl v7.8h, v4.8b, v18.8b
+ uaddw v7.8h, v7.8h, v19.8b
+
+ ext v20.16b, v1.16b, v2.16b, #2
+ ext v21.16b, v1.16b, v2.16b, #4
+ ext v22.16b, v5.16b, v6.16b, #2
+ ext v23.16b, v5.16b, v6.16b, #4
+
+ uaddl_nh v26.4s, v27.4s, v1, v20, \w
+ uaddw_nh v26.4s, v27.4s, v21, \w
+
+ uaddl_nh v28.4s, v29.4s, v5, v22, \w
+ uaddw_nh v28.4s, v29.4s, v23, \w
+.endm
+ add3 8
+ st1 {v3.8h}, [x1], #16
+ st1 {v7.8h}, [x11], #16
+ st1 {v26.4s,v27.4s}, [x0], #32
+ st1 {v28.4s,v29.4s}, [x10], #32
+
+ subs w5, w5, #8
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x3], #8
+ ld1 {v7.8b}, [x12], #8
+ mov v1.16b, v2.16b
+ mov v5.16b, v6.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ ext v4.16b, v4.16b, v7.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+ umull v6.8h, v7.8b, v7.8b
+
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+5: // Produce 4 pixels, 6 <= w < 10
+ add3 4
+ st1 {v3.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v26.4s}, [x0], #16
+ st1 {v28.4s}, [x10], #16
+
+ subs w5, w5, #4 // 2 <= w < 6
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v4.16b, v4.16b, v4.16b, #4
+
+6: // Pad the right edge and produce the last few pixels.
+ // 2 <= w < 6, 2-5 pixels valid in v0
+ sub w13, w5, #2
+ // w13 = (pixels valid - 2)
+ adr x14, L(box3_variable_shift_tbl)
+ ldrh w13, [x14, w13, uxtw #1]
+ sub x13, x14, w13, uxth
+ br x13
+ // Shift v0 right, shifting out invalid pixels,
+ // shift v0 left to the original offset, shifting in padding pixels.
+22: // 2 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #2
+ ext v4.16b, v4.16b, v4.16b, #2
+ ext v0.16b, v0.16b, v30.16b, #14
+ ext v4.16b, v4.16b, v31.16b, #14
+ b 88f
+33: // 3 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #3
+ ext v4.16b, v4.16b, v4.16b, #3
+ ext v0.16b, v0.16b, v30.16b, #13
+ ext v4.16b, v4.16b, v31.16b, #13
+ b 88f
+44: // 4 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v4.16b, v4.16b, v4.16b, #4
+ ext v0.16b, v0.16b, v30.16b, #12
+ ext v4.16b, v4.16b, v31.16b, #12
+ b 88f
+55: // 5 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #5
+ ext v4.16b, v4.16b, v4.16b, #5
+ ext v0.16b, v0.16b, v30.16b, #11
+ ext v4.16b, v4.16b, v31.16b, #11
+ b 88f
+
+L(box3_variable_shift_tbl):
+ .hword L(box3_variable_shift_tbl) - 22b
+ .hword L(box3_variable_shift_tbl) - 33b
+ .hword L(box3_variable_shift_tbl) - 44b
+ .hword L(box3_variable_shift_tbl) - 55b
+
+88:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+ umull v5.8h, v4.8b, v4.8b
+ umull2 v6.8h, v4.16b, v4.16b
+
+ add3 4
+ st1 {v3.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v26.4s}, [x0], #16
+ st1 {v28.4s}, [x10], #16
+ subs w5, w5, #4
+ b.le 9f
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v4.16b, v4.16b, v4.16b, #4
+ ext v1.16b, v1.16b, v2.16b, #8
+ ext v5.16b, v5.16b, v6.16b, #8
+ // Only one needed pixel left, but do a normal 4 pixel
+ // addition anyway
+ add3 4
+ st1 {v3.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v26.4s}, [x0], #16
+ st1 {v28.4s}, [x10], #16
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x9, lsl #1
+ add x10, x10, x9, lsl #1
+ add x1, x1, x9
+ add x11, x11, x9
+ add x3, x3, x4
+ add x12, x12, x4
+ mov w5, w8
+ b 1b
+0:
+ ret
+.purgem add3
+endfunc
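
For reference, here is a rough scalar model of what one row of the box3 horizontal pass computes: a sliding 3-tap sum of the pixels and of their squares. The helper name is hypothetical, and the left/right edge padding and the two-rows-per-iteration interleaving done by the assembly are omitted.

    #include <stdint.h>

    // One row of 3-tap box sums; 'src' is assumed to already point at the
    // leftmost pixel the first output needs (edge padding not shown).
    static void box3_h_row(int32_t *sumsq, int16_t *sum,
                           const uint8_t *src, int w)
    {
        for (int i = 0; i < w; i++) {
            const int a = src[i], b = src[i + 1], c = src[i + 2];
            sum[i]   = (int16_t)(a + b + c);
            sumsq[i] = a * a + b * b + c * c;
        }
    }

sgr_box5_h_neon below follows the same pattern with 5 taps.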
+
+// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
+// const pixel (*left)[4],
+// const pixel *src, const ptrdiff_t stride,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_h_neon, export=1
+ add w5, w5, #2 // w += 2
+
+ // Set up pointers for reading/writing alternate rows
+ add x10, x0, #(4*SUM_STRIDE) // sumsq
+ add x11, x1, #(2*SUM_STRIDE) // sum
+ add x12, x3, x4 // src
+ lsl x4, x4, #1
+ mov x9, #(2*2*SUM_STRIDE) // double sum stride
+
+ // Subtract the aligned width from the output stride.
+ // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+ // Subtract the number of pixels read from the input from the stride.
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 0f
+ // !LR_HAVE_RIGHT
+ add w13, w5, #3
+ bic w13, w13, #3
+ add w14, w5, #13
+ b 1f
+0:
+ add w13, w5, #7
+ bic w13, w13, #7
+ add w14, w5, #15
+1:
+ sub x9, x9, w13, uxtw #1
+ bic w14, w14, #7
+ sub x4, x4, w14, uxtw
+
+ // Store the width for the vertical loop
+ mov w8, w5
+
+ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 2f
+ // LR_HAVE_LEFT
+ cbnz x2, 0f
+ // left == NULL
+ sub x3, x3, #3
+ sub x12, x12, #3
+ b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+ // For this case we don't read the left 3 pixels from the src pointer,
+ // but shift it as if we had done that.
+ add x4, x4, #3
+
+1: // Loop vertically
+ ld1 {v0.16b}, [x3], #16
+ ld1 {v4.16b}, [x12], #16
+
+ tst w7, #1 // LR_HAVE_LEFT
+ b.eq 0f
+ cbz x2, 2f
+ // LR_HAVE_LEFT, left != NULL
+ ld1 {v1.s}[3], [x2], #4
+ // Move x3/x12 back to account for the last 3 bytes we loaded earlier,
+ // which we'll shift out.
+ sub x3, x3, #3
+ sub x12, x12, #3
+ ld1 {v5.s}[3], [x2], #4
+ ext v0.16b, v1.16b, v0.16b, #13
+ ext v4.16b, v5.16b, v4.16b, #13
+ b 2f
+0:
+ // !LR_HAVE_LEFT, fill v1 with the leftmost byte
+ // and shift v0 to have 3x the first byte at the front.
+ dup v1.16b, v0.b[0]
+ dup v5.16b, v4.b[0]
+ // Move x3/x12 back to account for the last 3 bytes we loaded before,
+ // which we shifted out.
+ sub x3, x3, #3
+ sub x12, x12, #3
+ ext v0.16b, v1.16b, v0.16b, #13
+ ext v4.16b, v5.16b, v4.16b, #13
+
+2:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+ umull v5.8h, v4.8b, v4.8b
+ umull2 v6.8h, v4.16b, v4.16b
+
+ tst w7, #2 // LR_HAVE_RIGHT
+ b.ne 4f
+ // If we'll need to pad the right edge, load that byte to pad with
+ // here since we can find it pretty easily from here.
+ sub w13, w5, #(2 + 16 - 3 + 1)
+ ldr b30, [x3, w13, sxtw]
+ ldr b31, [x12, w13, sxtw]
+ // Fill v30/v31 with the right padding pixel
+ dup v30.8b, v30.b[0]
+ dup v31.8b, v31.b[0]
+3: // !LR_HAVE_RIGHT
+ // If we'll have to pad the right edge we need to quit early here.
+ cmp w5, #11
+ b.ge 4f // If w >= 11, all used input pixels are valid
+ cmp w5, #7
+ b.ge 5f // If w >= 7, we can produce 4 pixels
+ b 6f
+
+4: // Loop horizontally
+.macro add5 w
+ ext v16.16b, v0.16b, v0.16b, #1
+ ext v17.16b, v0.16b, v0.16b, #2
+ ext v18.16b, v0.16b, v0.16b, #3
+ ext v19.16b, v0.16b, v0.16b, #4
+ ext v20.16b, v4.16b, v4.16b, #1
+ ext v21.16b, v4.16b, v4.16b, #2
+ ext v22.16b, v4.16b, v4.16b, #3
+ ext v23.16b, v4.16b, v4.16b, #4
+ uaddl v3.8h, v0.8b, v16.8b
+ uaddl v24.8h, v17.8b, v18.8b
+ uaddl v7.8h, v4.8b, v20.8b
+ uaddw v3.8h, v3.8h, v19.8b
+ uaddl v25.8h, v21.8b, v22.8b
+ uaddw v7.8h, v7.8h, v23.8b
+ add v3.8h, v3.8h, v24.8h
+ add v7.8h, v7.8h, v25.8h
+
+ ext v16.16b, v1.16b, v2.16b, #2
+ ext v17.16b, v1.16b, v2.16b, #4
+ ext v18.16b, v1.16b, v2.16b, #6
+ ext v19.16b, v1.16b, v2.16b, #8
+ ext v20.16b, v5.16b, v6.16b, #2
+ ext v21.16b, v5.16b, v6.16b, #4
+ ext v22.16b, v5.16b, v6.16b, #6
+ ext v23.16b, v5.16b, v6.16b, #8
+
+ uaddl_nh v26.4s, v27.4s, v1, v16, \w
+ uaddl_nh v16.4s, v17.4s, v17, v18, \w
+ uaddl_nh v28.4s, v29.4s, v5, v20, \w
+ uaddw_nh v26.4s, v27.4s, v19, \w
+ uaddl_nh v20.4s, v21.4s, v21, v22, \w
+ uaddw_nh v28.4s, v29.4s, v23, \w
+ add_nh v26.4s, v27.4s, v16.4s, v17.4s, \w
+ add_nh v28.4s, v29.4s, v20.4s, v21.4s, \w
+.endm
+ add5 8
+ st1 {v3.8h}, [x1], #16
+ st1 {v7.8h}, [x11], #16
+ st1 {v26.4s,v27.4s}, [x0], #32
+ st1 {v28.4s,v29.4s}, [x10], #32
+
+ subs w5, w5, #8
+ b.le 9f
+ tst w7, #2 // LR_HAVE_RIGHT
+ ld1 {v3.8b}, [x3], #8
+ ld1 {v7.8b}, [x12], #8
+ mov v1.16b, v2.16b
+ mov v5.16b, v6.16b
+ ext v0.16b, v0.16b, v3.16b, #8
+ ext v4.16b, v4.16b, v7.16b, #8
+ umull v2.8h, v3.8b, v3.8b
+ umull v6.8h, v7.8b, v7.8b
+ b.ne 4b // If we don't need to pad, just keep summing.
+ b 3b // If we need to pad, check how many pixels we have left.
+
+5: // Produce 4 pixels, 7 <= w < 11
+ add5 4
+ st1 {v3.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v26.4s}, [x0], #16
+ st1 {v28.4s}, [x10], #16
+
+ subs w5, w5, #4 // 3 <= w < 7
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v4.16b, v4.16b, v4.16b, #4
+
+6: // Pad the right edge and produce the last few pixels.
+ // w < 7, w+1 pixels valid in v0/v4
+ sub w13, w5, #1
+ // w13 = pixels valid - 2
+ adr x14, L(box5_variable_shift_tbl)
+ ldrh w13, [x14, w13, uxtw #1]
+ sub x13, x14, w13, uxth
+ br x13
+ // Shift v0/v4 right, shifting out invalid pixels,
+ // shift v0/v4 left to the original offset, shifting in padding pixels.
+22: // 2 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #2
+ ext v4.16b, v4.16b, v4.16b, #2
+ ext v0.16b, v0.16b, v30.16b, #14
+ ext v4.16b, v4.16b, v31.16b, #14
+ b 88f
+33: // 3 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #3
+ ext v4.16b, v4.16b, v4.16b, #3
+ ext v0.16b, v0.16b, v30.16b, #13
+ ext v4.16b, v4.16b, v31.16b, #13
+ b 88f
+44: // 4 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v4.16b, v4.16b, v4.16b, #4
+ ext v0.16b, v0.16b, v30.16b, #12
+ ext v4.16b, v4.16b, v31.16b, #12
+ b 88f
+55: // 5 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #5
+ ext v4.16b, v4.16b, v4.16b, #5
+ ext v0.16b, v0.16b, v30.16b, #11
+ ext v4.16b, v4.16b, v31.16b, #11
+ b 88f
+66: // 6 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #6
+ ext v4.16b, v4.16b, v4.16b, #6
+ ext v0.16b, v0.16b, v30.16b, #10
+ ext v4.16b, v4.16b, v31.16b, #10
+ b 88f
+77: // 7 pixels valid
+ ext v0.16b, v0.16b, v0.16b, #7
+ ext v4.16b, v4.16b, v4.16b, #7
+ ext v0.16b, v0.16b, v30.16b, #9
+ ext v4.16b, v4.16b, v31.16b, #9
+ b 88f
+
+L(box5_variable_shift_tbl):
+ .hword L(box5_variable_shift_tbl) - 22b
+ .hword L(box5_variable_shift_tbl) - 33b
+ .hword L(box5_variable_shift_tbl) - 44b
+ .hword L(box5_variable_shift_tbl) - 55b
+ .hword L(box5_variable_shift_tbl) - 66b
+ .hword L(box5_variable_shift_tbl) - 77b
+
+88:
+ umull v1.8h, v0.8b, v0.8b
+ umull2 v2.8h, v0.16b, v0.16b
+ umull v5.8h, v4.8b, v4.8b
+ umull2 v6.8h, v4.16b, v4.16b
+
+ add5 4
+ st1 {v3.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v26.4s}, [x0], #16
+ st1 {v28.4s}, [x10], #16
+ subs w5, w5, #4
+ b.le 9f
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v1.16b, v1.16b, v2.16b, #8
+ ext v4.16b, v4.16b, v4.16b, #4
+ ext v5.16b, v5.16b, v6.16b, #8
+ add5 4
+ st1 {v3.4h}, [x1], #8
+ st1 {v7.4h}, [x11], #8
+ st1 {v26.4s}, [x0], #16
+ st1 {v28.4s}, [x10], #16
+
+9:
+ subs w6, w6, #2
+ b.le 0f
+ // Jump to the next row and loop horizontally
+ add x0, x0, x9, lsl #1
+ add x10, x10, x9, lsl #1
+ add x1, x1, x9
+ add x11, x11, x9
+ add x3, x3, x4
+ add x12, x12, x4
+ mov w5, w8
+ b 1b
+0:
+ ret
+.purgem add5
+endfunc
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_v_neon, export=1
+ add w10, w3, #2 // Number of output rows to move back
+ mov w11, w3 // Number of input rows to move back
+ add w2, w2, #2 // Actual summed width
+ mov x7, #(4*SUM_STRIDE) // sumsq stride
+ mov x8, #(2*SUM_STRIDE) // sum stride
+ sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst w4, #4 // LR_HAVE_TOP
+ b.eq 0f
+ // If have top, read from row -2.
+ sub x5, x0, #(4*SUM_STRIDE)
+ sub x6, x1, #(2*SUM_STRIDE)
+ add w11, w11, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add x5, x0, #(4*SUM_STRIDE)
+ add x6, x1, #(2*SUM_STRIDE)
+1:
+
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.eq 1f
+ // LR_HAVE_BOTTOM
+ add w3, w3, #2 // Sum all h+2 lines with the main loop
+ add w11, w11, #2
+1:
+ mov w9, w3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into v16-v21 and v24-v26 taking top
+ // padding into consideration.
+ tst w4, #4 // LR_HAVE_TOP
+ ld1 {v16.4s, v17.4s}, [x5], x7
+ ld1 {v24.8h}, [x6], x8
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v18.4s, v19.4s}, [x5], x7
+ ld1 {v25.8h}, [x6], x8
+ ld1 {v20.4s, v21.4s}, [x5], x7
+ ld1 {v26.8h}, [x6], x8
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v18.16b, v16.16b
+ mov v19.16b, v17.16b
+ mov v25.16b, v24.16b
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v26.16b, v24.16b
+
+3:
+ subs w3, w3, #1
+.macro add3
+ add v16.4s, v16.4s, v18.4s
+ add v17.4s, v17.4s, v19.4s
+ add v24.8h, v24.8h, v25.8h
+ add v16.4s, v16.4s, v20.4s
+ add v17.4s, v17.4s, v21.4s
+ add v24.8h, v24.8h, v26.8h
+ st1 {v16.4s, v17.4s}, [x0], x7
+ st1 {v24.8h}, [x1], x8
+.endm
+ add3
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v24.16b, v25.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v25.16b, v26.16b
+ b.le 4f
+ ld1 {v20.4s, v21.4s}, [x5], x7
+ ld1 {v26.8h}, [x6], x8
+ b 3b
+
+4:
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.ne 5f
+ // !LR_HAVE_BOTTOM
+ // Produce two more rows, extending the already loaded rows.
+ add3
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v24.16b, v25.16b
+ add3
+
+5: // End of one vertical slice.
+ subs w2, w2, #8
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ msub x5, x7, x11, x5
+ msub x6, x8, x11, x6
+ // Output pointers
+ msub x0, x7, x10, x0
+ msub x1, x8, x10, x1
+ add x0, x0, #32
+ add x1, x1, #16
+ add x5, x5, #32
+ add x6, x6, #16
+ mov w3, w9
+ b 1b
+
+0:
+ ret
+.purgem add3
+endfunc
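
Again for reference, a rough scalar model of the vertical pass (hypothetical helper, edge handling simplified): each output row is the sum of three vertically adjacent rows of the horizontal sums, with the int16_t sum plane treated the same way. The assembly does this in place, reading ahead of the rows it overwrites, and synthesizes the missing rows when LR_HAVE_TOP/LR_HAVE_BOTTOM are not set.

    // Sum three vertically adjacent rows of 32-bit sums; SUM_STRIDE is the
    // (384+16) row pitch defined above. The 16-bit 'sum' plane is handled
    // identically.
    static void box3_v_row(int32_t *out, const int32_t *in, int w)
    {
        for (int x = 0; x < w; x++)
            out[x] = in[x - SUM_STRIDE] + in[x] + in[x + SUM_STRIDE];
    }

sgr_box5_v_neon below does the same over five rows.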
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_v_neon, export=1
+ add w10, w3, #2 // Number of output rows to move back
+ mov w11, w3 // Number of input rows to move back
+ add w2, w2, #8 // Actual summed width
+ mov x7, #(4*SUM_STRIDE) // sumsq stride
+ mov x8, #(2*SUM_STRIDE) // sum stride
+ sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst w4, #4 // LR_HAVE_TOP
+ b.eq 0f
+ // If have top, read from row -2.
+ sub x5, x0, #(4*SUM_STRIDE)
+ sub x6, x1, #(2*SUM_STRIDE)
+ add w11, w11, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add x5, x0, #(4*SUM_STRIDE)
+ add x6, x1, #(2*SUM_STRIDE)
+1:
+
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.eq 0f
+ // LR_HAVE_BOTTOM
+ add w3, w3, #2 // Handle h+2 lines with the main loop
+ add w11, w11, #2
+ b 1f
+0:
+ // !LR_HAVE_BOTTOM
+ sub w3, w3, #1 // Handle h-1 lines with the main loop
+1:
+ mov w9, w3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into v16-v25 and v26-v30 taking top
+ // padding into consideration.
+ tst w4, #4 // LR_HAVE_TOP
+ ld1 {v16.4s, v17.4s}, [x5], x7
+ ld1 {v26.8h}, [x6], x8
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v20.4s, v21.4s}, [x5], x7
+ ld1 {v28.8h}, [x6], x8
+ mov v18.16b, v16.16b
+ mov v19.16b, v17.16b
+ mov v27.16b, v26.16b
+ ld1 {v22.4s, v23.4s}, [x5], x7
+ ld1 {v29.8h}, [x6], x8
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v18.16b, v16.16b
+ mov v19.16b, v17.16b
+ mov v27.16b, v26.16b
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v28.16b, v26.16b
+ mov v22.16b, v16.16b
+ mov v23.16b, v17.16b
+ mov v29.16b, v26.16b
+
+3:
+ cbz w3, 4f
+ ld1 {v24.4s, v25.4s}, [x5], x7
+ ld1 {v30.8h}, [x6], x8
+
+3:
+ // Start of vertical loop
+ subs w3, w3, #2
+.macro add5
+ add v16.4s, v16.4s, v18.4s
+ add v17.4s, v17.4s, v19.4s
+ add v26.8h, v26.8h, v27.8h
+ add v0.4s, v20.4s, v22.4s
+ add v1.4s, v21.4s, v23.4s
+ add v2.8h, v28.8h, v29.8h
+ add v16.4s, v16.4s, v24.4s
+ add v17.4s, v17.4s, v25.4s
+ add v26.8h, v26.8h, v30.8h
+ add v16.4s, v16.4s, v0.4s
+ add v17.4s, v17.4s, v1.4s
+ add v26.8h, v26.8h, v2.8h
+ st1 {v16.4s, v17.4s}, [x0], x7
+ st1 {v26.8h}, [x1], x8
+.endm
+ add5
+.macro shift2
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v26.16b, v28.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+ mov v27.16b, v29.16b
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v28.16b, v30.16b
+.endm
+ shift2
+ add x0, x0, x7
+ add x1, x1, x8
+ b.le 5f
+ ld1 {v22.4s, v23.4s}, [x5], x7
+ ld1 {v29.8h}, [x6], x8
+ ld1 {v24.4s, v25.4s}, [x5], x7
+ ld1 {v30.8h}, [x6], x8
+ b 3b
+
+4:
+ // h == 1, !LR_HAVE_BOTTOM.
+ // Pad the last row with the only content row, and add.
+ mov v24.16b, v22.16b
+ mov v25.16b, v23.16b
+ mov v30.16b, v29.16b
+ add5
+ shift2
+ add x0, x0, x7
+ add x1, x1, x8
+ add5
+ b 6f
+
+5:
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.ne 6f
+ // !LR_HAVE_BOTTOM
+ cbnz w3, 5f
+ // Three edge rows are left; output the one at h-2 and
+ // the past-edge one at h.
+ ld1 {v22.4s, v23.4s}, [x5], x7
+ ld1 {v29.8h}, [x6], x8
+ // Pad the past-edge row from the last content row.
+ mov v24.16b, v22.16b
+ mov v25.16b, v23.16b
+ mov v30.16b, v29.16b
+ add5
+ shift2
+ add x0, x0, x7
+ add x1, x1, x8
+ // The last two rows are already padded properly here.
+ add5
+ b 6f
+
+5:
+ // w3 == -1, two rows left, output one.
+ // Pad the last two rows from the mid one.
+ mov v22.16b, v20.16b
+ mov v23.16b, v21.16b
+ mov v29.16b, v28.16b
+ mov v24.16b, v20.16b
+ mov v25.16b, v21.16b
+ mov v30.16b, v28.16b
+ add5
+ add x0, x0, x7
+ add x1, x1, x8
+ b 6f
+
+6: // End of one vertical slice.
+ subs w2, w2, #8
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ msub x5, x7, x11, x5
+ msub x6, x8, x11, x6
+ // Output pointers
+ msub x0, x7, x10, x0
+ msub x1, x8, x10, x1
+ add x0, x0, #32
+ add x1, x1, #16
+ add x5, x5, #32
+ add x6, x6, #16
+ mov w3, w9
+ b 1b
+
+0:
+ ret
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength);
+function sgr_calc_ab1_neon, export=1
+ add x3, x3, #2 // h += 2
+ movi v31.4s, #9 // n
+ mov x5, #455
+ mov x8, #SUM_STRIDE
+ b sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+ add x3, x3, #3 // h += 3
+ asr x3, x3, #1 // h /= 2
+ movi v31.4s, #25 // n
+ mov x5, #164
+ mov x8, #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+ movrel x12, X(sgr_x_by_x)
+ ld1 {v16.16b, v17.16b, v18.16b}, [x12]
+ movi v19.16b, #5
+ movi v20.8b, #55 // idx of last 5
+ movi v21.8b, #72 // idx of last 4
+ movi v22.8b, #101 // idx of last 3
+ movi v23.8b, #169 // idx of last 2
+ movi v24.8b, #254 // idx of last 1
+ add x2, x2, #2 // w += 2
+ add x7, x2, #7
+ bic x7, x7, #7 // aligned w
+ sub x7, x8, x7 // increment between rows
+ movi v29.8h, #1, lsl #8
+ dup v28.4s, w4
+ dup v30.4s, w5 // one_by_x
+ sub x0, x0, #(4*(SUM_STRIDE))
+ sub x1, x1, #(2*(SUM_STRIDE))
+ mov x6, x2 // backup of w
+ sub v16.16b, v16.16b, v19.16b
+ sub v17.16b, v17.16b, v19.16b
+ sub v18.16b, v18.16b, v19.16b
+1:
+ subs x2, x2, #8
+ ld1 {v0.4s, v1.4s}, [x0] // a
+ ld1 {v2.8h}, [x1] // b
+ mul v0.4s, v0.4s, v31.4s // a * n
+ mul v1.4s, v1.4s, v31.4s // a * n
+ umull v3.4s, v2.4h, v2.4h // b * b
+ umull2 v4.4s, v2.8h, v2.8h // b * b
+ uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
+ uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
+ mul v0.4s, v0.4s, v28.4s // p * s
+ mul v1.4s, v1.4s, v28.4s // p * s
+ uqshrn v0.4h, v0.4s, #16
+ uqshrn2 v0.8h, v1.4s, #16
+ uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
+
+ cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
+ cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
+ tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
+ cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
+ cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
+ add v25.8b, v25.8b, v26.8b
+ cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
+ add v27.8b, v27.8b, v5.8b
+ add v6.8b, v6.8b, v19.8b
+ add v25.8b, v25.8b, v27.8b
+ add v1.8b, v1.8b, v6.8b
+ add v1.8b, v1.8b, v25.8b
+ uxtl v1.8h, v1.8b // x
+
+ umull v3.4s, v1.4h, v2.4h // x * BB[i]
+ umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
+ mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ srshr v3.4s, v3.4s, #12 // AA[i]
+ srshr v4.4s, v4.4s, #12 // AA[i]
+ sub v2.8h, v29.8h, v1.8h // 256 - x
+
+ st1 {v3.4s, v4.4s}, [x0], #32
+ st1 {v2.8h}, [x1], #16
+ b.gt 1b
+
+ subs x3, x3, #1
+ b.le 0f
+ add x0, x0, x7, lsl #2
+ add x1, x1, x7, lsl #1
+ mov x2, x6
+ b 1b
+0:
+ ret
+endfunc
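
For reference, a rough scalar model of one element of the a/b -> A/B conversion above (hypothetical helper; the saturation and the split uqshrn #16 / uqrshrn #4 rounding are simplified into one rounded shift). n is the 9 or 25 loaded into v31, s is the strength argument (w4), and one_by_x the 455/164 constant (x5). The NEON code rebuilds the sgr_x_by_x lookup from its first 48 entries plus the cmhi threshold comparisons; this sketch simply indexes the full table.

    #include <stdint.h>

    static void calc_ab_one(int32_t *A, int16_t *B, int n, int s, int one_by_x,
                            const uint8_t sgr_x_by_x[256])
    {
        const uint32_t a  = (uint32_t)*A * n;              // a * n
        const uint32_t bb = (uint32_t)*B * *B;             // b * b
        const uint32_t p  = a > bb ? a - bb : 0;           // imax(a * n - b * b, 0)
        uint32_t z = (p * (uint32_t)s + (1 << 19)) >> 20;  // p * s, shifted with rounding
        if (z > 255) z = 255;                              // imin(z, 255)
        const int x = sgr_x_by_x[z];
        *A = (x * *B * one_by_x + (1 << 11)) >> 12;        // x * BB[i] * sgr_one_by_x
        *B = 256 - x;                                      // 256 - x
    }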
+
+#define FILTER_OUT_STRIDE 384
+
+// void dav1d_sgr_finish_filter1_neon(coef *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
+function sgr_finish_filter1_neon, export=1
+ sub x7, x3, #(4*SUM_STRIDE)
+ add x8, x3, #(4*SUM_STRIDE)
+ sub x9, x4, #(2*SUM_STRIDE)
+ add x10, x4, #(2*SUM_STRIDE)
+ mov x11, #SUM_STRIDE
+ mov x12, #FILTER_OUT_STRIDE
+ add x13, x5, #7
+ bic x13, x13, #7 // Aligned width
+ sub x2, x2, x13
+ sub x12, x12, x13
+ sub x11, x11, x13
+ sub x11, x11, #4 // We read 4 extra elements from a
+ sub x14, x11, #4 // We read 8 extra elements from b
+ mov x13, x5
+ movi v6.8h, #3
+ movi v7.4s, #3
+1:
+ ld1 {v0.8h, v1.8h}, [x9], #32
+ ld1 {v2.8h, v3.8h}, [x4], #32
+ ld1 {v4.8h, v5.8h}, [x10], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
+ ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
+
+2:
+ subs x5, x5, #8
+ ext v25.16b, v0.16b, v1.16b, #2 // -stride
+ ext v26.16b, v2.16b, v3.16b, #2 // 0
+ ext v27.16b, v4.16b, v5.16b, #2 // +stride
+ ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v29.16b, v2.16b, v3.16b, #4 // +1
+ ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
+ add v2.8h, v2.8h, v25.8h // -1, -stride
+ add v26.8h, v26.8h, v27.8h // 0, +stride
+ add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
+ add v2.8h, v2.8h, v26.8h
+ add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
+ add v2.8h, v2.8h, v29.8h // +1
+ add v0.8h, v0.8h, v4.8h
+
+ ext v25.16b, v16.16b, v17.16b, #4 // -stride
+ ext v26.16b, v17.16b, v18.16b, #4
+ shl v2.8h, v2.8h, #2
+ ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v28.16b, v17.16b, v18.16b, #8
+ ext v29.16b, v19.16b, v20.16b, #4 // 0
+ ext v30.16b, v20.16b, v21.16b, #4
+ mla v2.8h, v0.8h, v6.8h // * 3 -> a
+ add v25.4s, v25.4s, v19.4s // -stride, -1
+ add v26.4s, v26.4s, v20.4s
+ add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v28.4s
+ ext v27.16b, v19.16b, v20.16b, #8 // +1
+ ext v28.16b, v20.16b, v21.16b, #8
+ add v16.4s, v16.4s, v22.4s // -1+stride
+ add v17.4s, v17.4s, v23.4s
+ add v29.4s, v29.4s, v27.4s // 0, +1
+ add v30.4s, v30.4s, v28.4s
+ add v25.4s, v25.4s, v29.4s
+ add v26.4s, v26.4s, v30.4s
+ ext v27.16b, v22.16b, v23.16b, #4 // +stride
+ ext v28.16b, v23.16b, v24.16b, #4
+ ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
+ ext v30.16b, v23.16b, v24.16b, #8
+ ld1 {v19.8b}, [x1], #8 // src
+ add v25.4s, v25.4s, v27.4s // +stride
+ add v26.4s, v26.4s, v28.4s
+ add v16.4s, v16.4s, v29.4s // +1+stride
+ add v17.4s, v17.4s, v30.4s
+ shl v25.4s, v25.4s, #2
+ shl v26.4s, v26.4s, #2
+ mla v25.4s, v16.4s, v7.4s // * 3 -> b
+ mla v26.4s, v17.4s, v7.4s
+ uxtl v19.8h, v19.8b // src
+ mov v0.16b, v1.16b
+ umlal v25.4s, v2.4h, v19.4h // b + a * src
+ umlal2 v26.4s, v2.8h, v19.8h
+ mov v2.16b, v3.16b
+ rshrn v25.4h, v25.4s, #9
+ rshrn2 v25.8h, v26.4s, #9
+ mov v4.16b, v5.16b
+ st1 {v25.8h}, [x0], #16
+
+ b.le 3f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ ld1 {v1.8h}, [x9], #16
+ ld1 {v3.8h}, [x4], #16
+ ld1 {v5.8h}, [x10], #16
+ ld1 {v17.4s, v18.4s}, [x7], #32
+ ld1 {v20.4s, v21.4s}, [x3], #32
+ ld1 {v23.4s, v24.4s}, [x8], #32
+ b 2b
+
+3:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x13
+ add x0, x0, x12, lsl #1
+ add x1, x1, x2
+ add x3, x3, x11, lsl #2
+ add x7, x7, x11, lsl #2
+ add x8, x8, x11, lsl #2
+ add x4, x4, x14, lsl #1
+ add x9, x9, x14, lsl #1
+ add x10, x10, x14, lsl #1
+ b 1b
+0:
+ ret
+endfunc
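
For reference, a rough scalar model of one output pixel of finish_filter1 (hypothetical helper). Following the "* 3 -> a" / "* 3 -> b" and "b + a * src" comments above, 'a' is built from the int16_t B coefficients and 'b' from the int32_t A coefficients: the centre and the four edge-adjacent neighbours get weight 4, the diagonals weight 3, and the result is narrowed with the same rounding as rshrn #9.

    #include <stddef.h>
    #include <stdint.h>

    static int16_t finish1_one(const uint8_t *src,
                               const int32_t *A, const int16_t *B,
                               ptrdiff_t s /* A/B row pitch, SUM_STRIDE here */)
    {
        const int a = 4 * (B[-s] + B[-1] + B[0] + B[1] + B[s]) +
                      3 * (B[-s - 1] + B[-s + 1] + B[s - 1] + B[s + 1]);
        const int b = 4 * (A[-s] + A[-1] + A[0] + A[1] + A[s]) +
                      3 * (A[-s - 1] + A[-s + 1] + A[s - 1] + A[s + 1]);
        return (int16_t)((b + a * src[0] + (1 << 8)) >> 9);  // matches rshrn #9
    }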
+
+// void dav1d_sgr_finish_filter2_neon(coef *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
+function sgr_finish_filter2_neon, export=1
+ add x7, x3, #(4*(SUM_STRIDE))
+ sub x3, x3, #(4*(SUM_STRIDE))
+ add x8, x4, #(2*(SUM_STRIDE))
+ sub x4, x4, #(2*(SUM_STRIDE))
+ mov x9, #(2*SUM_STRIDE)
+ mov x10, #FILTER_OUT_STRIDE
+ add x11, x5, #7
+ bic x11, x11, #7 // Aligned width
+ sub x2, x2, x11
+ sub x10, x10, x11
+ sub x9, x9, x11
+ sub x9, x9, #4 // We read 4 extra elements from a
+ sub x12, x9, #4 // We read 8 extra elements from b
+ mov x11, x5
+ movi v4.8h, #5
+ movi v5.4s, #5
+ movi v6.8h, #6
+ movi v7.4s, #6
+1:
+ ld1 {v0.8h, v1.8h}, [x4], #32
+ ld1 {v2.8h, v3.8h}, [x8], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
+
+2:
+ subs x5, x5, #8
+ ext v22.16b, v0.16b, v1.16b, #2 // -stride
+ ext v23.16b, v2.16b, v3.16b, #2 // +stride
+ ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
+ add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
+ add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
+ add v0.8h, v0.8h, v25.8h
+ add v2.8h, v22.8h, v23.8h // -stride, +stride
+
+ ext v22.16b, v16.16b, v17.16b, #4 // -stride
+ ext v23.16b, v17.16b, v18.16b, #4
+ ext v24.16b, v19.16b, v20.16b, #4 // +stride
+ ext v25.16b, v20.16b, v21.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v27.16b, v17.16b, v18.16b, #8
+ ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
+ ext v29.16b, v20.16b, v21.16b, #8
+ mul v0.8h, v0.8h, v4.8h // * 5
+ mla v0.8h, v2.8h, v6.8h // * 6
+ ld1 {v31.8b}, [x1], #8
+ add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v27.4s
+ add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
+ add v20.4s, v20.4s, v29.4s
+ add v16.4s, v16.4s, v19.4s
+ add v17.4s, v17.4s, v20.4s
+
+ add v22.4s, v22.4s, v24.4s // -stride, +stride
+ add v23.4s, v23.4s, v25.4s
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v16.4s, v16.4s, v5.4s // * 5
+ mla v16.4s, v22.4s, v7.4s // * 6
+ mul v17.4s, v17.4s, v5.4s // * 5
+ mla v17.4s, v23.4s, v7.4s // * 6
+
+ uxtl v31.8h, v31.8b
+ umlal v16.4s, v0.4h, v31.4h // b + a * src
+ umlal2 v17.4s, v0.8h, v31.8h
+ mov v0.16b, v1.16b
+ rshrn v16.4h, v16.4s, #9
+ rshrn2 v16.8h, v17.4s, #9
+ mov v2.16b, v3.16b
+ st1 {v16.8h}, [x0], #16
+
+ b.le 3f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v3.8h}, [x8], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ ld1 {v20.4s, v21.4s}, [x7], #32
+ b 2b
+
+3:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x11
+ add x0, x0, x10, lsl #1
+ add x1, x1, x2
+ add x3, x3, x9, lsl #2
+ add x7, x7, x9, lsl #2
+ add x4, x4, x12, lsl #1
+ add x8, x8, x12, lsl #1
+ mov x13, x3
+ mov x14, x4
+
+ ld1 {v0.8h, v1.8h}, [x4], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
+
+4:
+ subs x5, x5, #8
+ ext v22.16b, v0.16b, v1.16b, #2 // 0
+ ext v23.16b, v0.16b, v1.16b, #4 // +1
+ add v0.8h, v0.8h, v23.8h // -1, +1
+
+ ext v24.16b, v16.16b, v17.16b, #4 // 0
+ ext v25.16b, v17.16b, v18.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1
+ ext v27.16b, v17.16b, v18.16b, #8
+ mul v2.8h, v22.8h, v6.8h // * 6
+ mla v2.8h, v0.8h, v4.8h // * 5 -> a
+ ld1 {v31.8b}, [x1], #8
+ add v16.4s, v16.4s, v26.4s // -1, +1
+ add v17.4s, v17.4s, v27.4s
+ uxtl v31.8h, v31.8b
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v24.4s, v24.4s, v7.4s // * 6
+ mla v24.4s, v16.4s, v5.4s // * 5 -> b
+ mul v25.4s, v25.4s, v7.4s // * 6
+ mla v25.4s, v17.4s, v5.4s // * 5 -> b
+
+ umlal v24.4s, v2.4h, v31.4h // b + a * src
+ umlal2 v25.4s, v2.8h, v31.8h
+ mov v0.16b, v1.16b
+ rshrn v24.4h, v24.4s, #8
+ rshrn2 v24.8h, v25.4s, #8
+ mov v16.16b, v18.16b
+ st1 {v24.8h}, [x0], #16
+
+ b.le 5f
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ b 4b
+
+5:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x11
+ add x0, x0, x10, lsl #1
+ add x1, x1, x2
+ mov x3, x13 // Rewind x3/x4 to where they started
+ mov x4, x14
+ b 1b
+0:
+ ret
+endfunc
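
Likewise, a rough scalar model of finish_filter2 (hypothetical helpers). The 5x5 box coefficients only exist on every other row, so the first loop above combines the coefficient rows directly above and below the output row (weight 6 on the centre column, 5 on the columns at +-1, narrowed as rshrn #9), while the second loop handles the rows that coincide with a coefficient row, using just that row (weights 6 and 5, narrowed as rshrn #8).

    #include <stdint.h>

    // Output row lying between two coefficient rows (Aup/Bup above, Adn/Bdn below).
    static int16_t finish2_two_rows(const uint8_t *src,
                                    const int32_t *Aup, const int32_t *Adn,
                                    const int16_t *Bup, const int16_t *Bdn)
    {
        const int a = 6 * (Bup[0] + Bdn[0]) +
                      5 * (Bup[-1] + Bup[1] + Bdn[-1] + Bdn[1]);
        const int b = 6 * (Aup[0] + Adn[0]) +
                      5 * (Aup[-1] + Aup[1] + Adn[-1] + Adn[1]);
        return (int16_t)((b + a * src[0] + (1 << 8)) >> 9);
    }

    // Output row coinciding with a coefficient row.
    static int16_t finish2_one_row(const uint8_t *src,
                                   const int32_t *A, const int16_t *B)
    {
        const int a = 6 * B[0] + 5 * (B[-1] + B[1]);
        const int b = 6 * A[0] + 5 * (A[-1] + A[1]);
        return (int16_t)((b + a * src[0] + (1 << 7)) >> 8);
    }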
+
+// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const coef *t1, const int w, const int h,
+// const int wt);
+function sgr_weighted1_neon, export=1
+ dup v31.8h, w7
+ cmp x6, #2
+ add x9, x0, x1
+ add x10, x2, x3
+ add x11, x4, #2*FILTER_OUT_STRIDE
+ mov x7, #(4*FILTER_OUT_STRIDE)
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ add x8, x5, #7
+ bic x8, x8, #7 // Aligned width
+ sub x1, x1, x8
+ sub x3, x3, x8
+ sub x7, x7, x8, lsl #1
+ mov x8, x5
+ b.lt 2f
+1:
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v4.8b}, [x10], #8
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v5.8h}, [x11], #16
+ subs x5, x5, #8
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v4.8h, v4.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v5.8h, v5.8h, v4.8h // t1 - u
+ ushll v2.4s, v0.4h, #7 // u << 7
+ ushll2 v3.4s, v0.8h, #7 // u << 7
+ ushll v6.4s, v4.4h, #7 // u << 7
+ ushll2 v7.4s, v4.8h, #7 // u << 7
+ smlal v2.4s, v1.4h, v31.4h // v
+ smlal2 v3.4s, v1.8h, v31.8h // v
+ smlal v6.4s, v5.4h, v31.4h // v
+ smlal2 v7.4s, v5.8h, v31.8h // v
+ rshrn v2.4h, v2.4s, #11
+ rshrn2 v2.8h, v3.4s, #11
+ rshrn v6.4h, v6.4s, #11
+ rshrn2 v6.8h, v7.4s, #11
+ sqxtun v2.8b, v2.8h
+ sqxtun v6.8b, v6.8h
+ st1 {v2.8b}, [x0], #8
+ st1 {v6.8b}, [x9], #8
+ b.gt 1b
+
+ sub x6, x6, #2
+ cmp x6, #1
+ b.lt 0f
+ mov x5, x8
+ add x0, x0, x1
+ add x9, x9, x1
+ add x2, x2, x3
+ add x10, x10, x3
+ add x4, x4, x7
+ add x11, x11, x7
+ b.eq 2f
+ b 1b
+
+2:
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v1.8h}, [x4], #16
+ subs x5, x5, #8
+ ushll v0.8h, v0.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ ushll v2.4s, v0.4h, #7 // u << 7
+ ushll2 v3.4s, v0.8h, #7 // u << 7
+ smlal v2.4s, v1.4h, v31.4h // v
+ smlal2 v3.4s, v1.8h, v31.8h // v
+ rshrn v2.4h, v2.4s, #11
+ rshrn2 v2.8h, v3.4s, #11
+ sqxtun v2.8b, v2.8h
+ st1 {v2.8b}, [x0], #8
+ b.gt 2b
+0:
+ ret
+endfunc
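
For reference, a rough scalar model of one pixel of the weighted combine (hypothetical helper), following the "u" and "v" comments above: the source is scaled up to match t1, the difference is weighted, and the result is rounded back down and clamped as sqxtun does.

    #include <stdint.h>

    static uint8_t weighted1_one(uint8_t src, int16_t t1, int wt)
    {
        const int u = src << 4;                  // u
        const int v = (u << 7) + wt * (t1 - u);  // v
        int d = (v + (1 << 10)) >> 11;           // rshrn #11
        if (d < 0) d = 0;                        // sqxtun clamps to [0, 255]
        if (d > 255) d = 255;
        return (uint8_t)d;
    }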
+
+// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const coef *t1, const coef *t2,
+// const int w, const int h,
+// const int16_t wt[2]);
+function sgr_weighted2_neon, export=1
+ ldr x8, [sp]
+ ld1 {v31.s}[0], [x8]
+ cmp x7, #2
+ add x10, x0, x1
+ add x11, x2, x3
+ add x12, x4, #2*FILTER_OUT_STRIDE
+ add x13, x5, #2*FILTER_OUT_STRIDE
+ mov x8, #4*FILTER_OUT_STRIDE
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ add x9, x6, #7
+ bic x9, x9, #7 // Aligned width
+ sub x1, x1, x9
+ sub x3, x3, x9
+ sub x8, x8, x9, lsl #1
+ dup v30.8h, v31.h[0] // wt[0]
+ dup v31.8h, v31.h[1] // wt[1]
+ mov x9, x6
+ b.lt 2f
+1:
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v16.8b}, [x11], #8
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v17.8h}, [x12], #16
+ ld1 {v2.8h}, [x5], #16
+ ld1 {v18.8h}, [x13], #16
+ subs x6, x6, #8
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v16.8h, v16.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ sub v17.8h, v17.8h, v16.8h // t1 - u
+ sub v18.8h, v18.8h, v16.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ ushll v19.4s, v16.4h, #7 // u << 7
+ ushll2 v20.4s, v16.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+ smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ rshrn v19.4h, v19.4s, #11
+ rshrn2 v19.8h, v20.4s, #11
+ sqxtun v3.8b, v3.8h
+ sqxtun v19.8b, v19.8h
+ st1 {v3.8b}, [x0], #8
+ st1 {v19.8b}, [x10], #8
+ b.gt 1b
+
+ subs x7, x7, #2
+ cmp x7, #1
+ b.lt 0f
+ mov x6, x9
+ add x0, x0, x1
+ add x10, x10, x1
+ add x2, x2, x3
+ add x11, x11, x3
+ add x4, x4, x8
+ add x12, x12, x8
+ add x5, x5, x8
+ add x13, x13, x8
+ b.eq 2f
+ b 1b
+
+2:
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v2.8h}, [x5], #16
+ subs x6, x6, #8
+ ushll v0.8h, v0.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ sqxtun v3.8b, v3.8h
+ st1 {v3.8b}, [x0], #8
+ b.gt 2b
+0:
+ ret
+endfunc
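
The two-filter variant differs from the weighted1 sketch above only in mixing both filter outputs with the two weights (hypothetical helper):

    #include <stdint.h>

    static uint8_t weighted2_one(uint8_t src, int16_t t1, int16_t t2,
                                 const int16_t wt[2])
    {
        const int u = src << 4;
        const int v = (u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u);
        int d = (v + (1 << 10)) >> 11;
        if (d < 0) d = 0;
        if (d > 255) d = 255;
        return (uint8_t)d;
    }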