From e3dbf92664918ecc830b4fde74b7cc0f6cd2065c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 10 Feb 2020 10:03:27 +0200 Subject: arm64: looprestoration: NEON implementation of SGR for 10 bpc This only supports 10 bpc, not 12 bpc, as the sum and tmp buffers can be int16_t for 10 bpc, but need to be int32_t for 12 bpc. Make actual templates out of the functions in looprestoration_tmpl.S, and add box3/5_h to looprestoration16.S. Extend dav1d_sgr_calc_abX_neon with a mandatory bitdepth_max parameter (which is passed even in 8bpc mode), add a define to bitdepth.h for passing such a parameter in all modes. This makes this function a few instructions slower in 8bpc mode than it was before (overall impact seems to be around 1% of the total runtime of SGR), but allows using the same actual function instantiation for all modes, saving a bit of code size. Examples of checkasm runtimes: Cortex A53 A72 A73 selfguided_3x3_10bpc_neon: 516755.8 389412.7 349058.7 selfguided_5x5_10bpc_neon: 380699.9 293486.6 254591.6 selfguided_mix_10bpc_neon: 878142.3 667495.9 587844.6 Corresponding 8 bpc numbers for comparison: selfguided_3x3_8bpc_neon: 491058.1 361473.4 347705.9 selfguided_5x5_8bpc_neon: 352655.0 266423.7 248192.2 selfguided_mix_8bpc_neon: 826094.1 612372.2 581943.1 --- src/arm/64/looprestoration.S | 2 + src/arm/64/looprestoration16.S | 559 ++++++++++++++++++++++++++++++++++++ src/arm/64/looprestoration_common.S | 28 +- src/arm/64/looprestoration_tmpl.S | 141 ++++++++- src/arm/looprestoration_init_tmpl.c | 17 +- 5 files changed, 720 insertions(+), 27 deletions(-) (limited to 'src/arm') diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S index b6f0934..1e864c2 100644 --- a/src/arm/64/looprestoration.S +++ b/src/arm/64/looprestoration.S @@ -1148,3 +1148,5 @@ L(box5_variable_shift_tbl): ret .purgem add5 endfunc + +sgr_funcs 8 diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S index f792a1d..95f24fc 100644 --- a/src/arm/64/looprestoration16.S +++ b/src/arm/64/looprestoration16.S @@ -678,3 +678,562 @@ L(copy_narrow_tbl): .hword L(copy_narrow_tbl) - 60b .hword L(copy_narrow_tbl) - 70b endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_16bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. + tst w7, #2 // LR_HAVE_RIGHT + b.ne 0f + // !LR_HAVE_RIGHT + add w13, w5, #3 + bic w13, w13, #3 + b 1f +0: + add w13, w5, #7 + bic w13, w13, #7 +1: + sub x9, x9, w13, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Subtract the number of pixels read from the input from the stride + add w13, w5, #14 + bic w13, w13, #7 + sub x4, x4, w13, uxtw #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + sub x12, x12, #4 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. 
+ // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #4 + + +1: // Loop vertically + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x12], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.d}[1], [x2], #8 + // Move x3/x12 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + sub x12, x12, #4 + ld1 {v18.d}[1], [x2], #8 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 + ext v17.16b, v16.16b, v17.16b, #12 + ext v16.16b, v18.16b, v16.16b, #12 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 2x the first pixel at the front. + dup v2.8h, v0.h[0] + dup v18.8h, v16.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + sub x12, x12, #4 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 + ext v17.16b, v16.16b, v17.16b, #12 + ext v16.16b, v18.16b, v16.16b, #12 + +2: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 2 + 1) + ldr h30, [x3, w13, sxtw #1] + ldr h31, [x12, w13, sxtw #1] + // Fill v30/v31 with the right padding pixel + dup v30.8h, v30.h[0] + dup v31.8h, v31.h[0] +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. + cmp w5, #10 + b.ge 4f // If w >= 10, all used input pixels are valid + cmp w5, #6 + b.ge 5f // If w >= 6, we can filter 4 pixels + b 6f + +4: // Loop horizontally +.macro ext_n dst1, dst2, src1, src2, src3, n, w + ext \dst1, \src1, \src2, \n +.if \w > 4 + ext \dst2, \src2, \src3, \n +.endif +.endm +.macro add_n dst1, dst2, src1, src2, src3, src4, w + add \dst1, \src1, \src3 +.if \w > 4 + add \dst2, \src2, \src4 +.endif +.endm + +.macro add3 w, wd + ext v24.16b, v0.16b, v1.16b, #2 + ext v25.16b, v0.16b, v1.16b, #4 + ext v26.16b, v16.16b, v17.16b, #2 + ext v27.16b, v16.16b, v17.16b, #4 + add v6\wd, v0\wd, v24\wd + add v7\wd, v16\wd, v26\wd + add v6\wd, v6\wd, v25\wd + add v7\wd, v7\wd, v27\wd + + ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w + ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w + + add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w + add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w + + ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w + ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w + + add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w + add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w +.endm + add3 8, .8h + st1 {v6.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + st1 {v24.4s,v25.4s}, [x10], #32 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + mov v16.16b, v17.16b + ld1 {v1.8h}, [x3], #16 + ld1 {v17.8h}, [x12], #16 + mov v2.16b, v4.16b + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + mov v18.16b, v20.16b + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. 
+ +5: // Produce 4 pixels, 6 <= w < 10 + add3 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + + subs w5, w5, #4 // 2 <= w < 6 + ext v0.16b, v0.16b, v1.16b, #8 + ext v16.16b, v16.16b, v17.16b, #8 + +6: // Pad the right edge and produce the last few pixels. + // 2 <= w < 6, 2-5 pixels valid in v0 + sub w13, w5, #2 + // w13 = (pixels valid - 2) + adr x14, L(box3_variable_shift_tbl) + ldrh w13, [x14, w13, uxtw #1] + sub x13, x14, w13, uxth + br x13 + // Shift v0 right, shifting out invalid pixels, + // shift v0 left to the original offset, shifting in padding pixels. +22: // 2 pixels valid + ext v0.16b, v0.16b, v0.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v0.16b, v0.16b, v30.16b, #12 + ext v16.16b, v16.16b, v31.16b, #12 + b 88f +33: // 3 pixels valid + ext v0.16b, v0.16b, v0.16b, #6 + ext v16.16b, v16.16b, v16.16b, #6 + ext v0.16b, v0.16b, v30.16b, #10 + ext v16.16b, v16.16b, v31.16b, #10 + b 88f +44: // 4 pixels valid + ext v0.16b, v0.16b, v0.16b, #8 + ext v16.16b, v16.16b, v16.16b, #8 + ext v0.16b, v0.16b, v30.16b, #8 + ext v16.16b, v16.16b, v31.16b, #8 + b 88f +55: // 5 pixels valid + ext v0.16b, v0.16b, v0.16b, #10 + ext v16.16b, v16.16b, v16.16b, #10 + ext v0.16b, v0.16b, v30.16b, #6 + ext v16.16b, v16.16b, v31.16b, #6 + b 88f + +L(box3_variable_shift_tbl): + .hword L(box3_variable_shift_tbl) - 22b + .hword L(box3_variable_shift_tbl) - 33b + .hword L(box3_variable_shift_tbl) - 44b + .hword L(box3_variable_shift_tbl) - 55b + +88: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + + add3 4, .4h + subs w5, w5, #4 + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + b.le 9f + ext v0.16b, v0.16b, v0.16b, #8 + ext v16.16b, v16.16b, v16.16b, #8 + mov v2.16b, v3.16b + mov v3.16b, v4.16b + mov v18.16b, v19.16b + mov v19.16b, v20.16b + // Only one needed pixel left, but do a normal 4 pixel + // addition anyway + add3 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +.purgem add3 +endfunc + +// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_16bpc_neon, export=1 + add w5, w5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add x10, x0, #(4*SUM_STRIDE) // sumsq + add x11, x1, #(2*SUM_STRIDE) // sum + add x12, x3, x4 // src + lsl x4, x4, #1 + mov x9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. + // Subtract the number of pixels read from the input from the stride. 
+ tst w7, #2 // LR_HAVE_RIGHT + b.ne 0f + // !LR_HAVE_RIGHT + add w13, w5, #3 + bic w13, w13, #3 + add w14, w5, #13 + b 1f +0: + add w13, w5, #7 + bic w13, w13, #7 + add w14, w5, #15 +1: + sub x9, x9, w13, uxtw #1 + bic w14, w14, #7 + sub x4, x4, w14, uxtw #1 + + // Store the width for the vertical loop + mov w8, w5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w7, #1 // LR_HAVE_LEFT + b.eq 2f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + sub x12, x12, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add x4, x4, #6 + +1: // Loop vertically + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v16.8h, v17.8h}, [x12], #32 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 0f + cbz x2, 2f + // LR_HAVE_LEFT, left != NULL + ld1 {v2.d}[1], [x2], #8 + // Move x3/x12 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #6 + sub x12, x12, #6 + ld1 {v18.d}[1], [x2], #8 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + ext v17.16b, v16.16b, v17.16b, #10 + ext v16.16b, v18.16b, v16.16b, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 3x the first pixel at the front. + dup v2.8h, v0.h[0] + dup v18.8h, v16.h[0] + // Move x3 back to account for the last 6 bytes we loaded before, + // which we shifted out. + sub x3, x3, #6 + sub x12, x12, #6 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + ext v17.16b, v16.16b, v17.16b, #10 + ext v16.16b, v18.16b, v16.16b, #10 + +2: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. + sub w13, w5, #(2 + 16 - 3 + 1) + ldr h30, [x3, w13, sxtw #1] + ldr h31, [x12, w13, sxtw #1] + // Fill v30/v31 with the right padding pixel + dup v30.8h, v30.h[0] + dup v31.8h, v31.h[0] +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. 
+ cmp w5, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + cmp w5, #7 + b.ge 5f // If w >= 7, we can produce 4 pixels + b 6f + +4: // Loop horizontally +.macro add5 w, wd + ext v24.16b, v0.16b, v1.16b, #2 + ext v25.16b, v0.16b, v1.16b, #4 + ext v26.16b, v0.16b, v1.16b, #6 + ext v27.16b, v0.16b, v1.16b, #8 + + add v6\wd, v0\wd, v24\wd + add v25\wd, v25\wd, v26\wd + add v6\wd, v6\wd, v27\wd + + ext v26.16b, v16.16b, v17.16b, #2 + ext v27.16b, v16.16b, v17.16b, #4 + ext v28.16b, v16.16b, v17.16b, #6 + ext v29.16b, v16.16b, v17.16b, #8 + + add v7\wd, v16\wd, v26\wd + add v27\wd, v27\wd, v28\wd + add v7\wd, v7\wd, v29\wd + add v6\wd, v6\wd, v25\wd + add v7\wd, v7\wd, v27\wd + + ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w + ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w + ext_n v28.16b, v29.16b, v2.16b, v3.16b, v4.16b, #12, \w + + add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w + add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w + add_n v22.4s, v23.4s, v22.4s, v23.4s, v3.4s, v4.4s, \w + add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w + + ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w + ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w + ext_n v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w + + add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w + add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w + add_n v24.4s, v25.4s, v24.4s, v25.4s, v19.4s, v20.4s, \w + add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w +.endm + add5 8, .8h + st1 {v6.8h}, [x1], #16 + st1 {v7.8h}, [x11], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + st1 {v24.4s,v25.4s}, [x10], #32 + + subs w5, w5, #8 + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + mov v16.16b, v17.16b + ld1 {v1.8h}, [x3], #16 + ld1 {v17.8h}, [x12], #16 + mov v2.16b, v4.16b + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + mov v18.16b, v20.16b + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Produce 4 pixels, 7 <= w < 11 + add5 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + + subs w5, w5, #4 // 3 <= w < 7 + ext v0.16b, v0.16b, v1.16b, #8 + ext v16.16b, v16.16b, v17.16b, #8 + +6: // Pad the right edge and produce the last few pixels. + // w < 7, w+1 pixels valid in v0/v4 + sub w13, w5, #1 + // w13 = pixels valid - 2 + adr x14, L(box5_variable_shift_tbl) + ldrh w13, [x14, w13, uxtw #1] + mov v1.16b, v30.16b + mov v17.16b, v31.16b + sub x13, x14, w13, uxth + br x13 + // Shift v0 right, shifting out invalid pixels, + // shift v0 left to the original offset, shifting in padding pixels. 
+22: // 2 pixels valid + ext v0.16b, v0.16b, v0.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v0.16b, v0.16b, v30.16b, #12 + ext v16.16b, v16.16b, v31.16b, #12 + b 88f +33: // 3 pixels valid + ext v0.16b, v0.16b, v0.16b, #6 + ext v16.16b, v16.16b, v16.16b, #6 + ext v0.16b, v0.16b, v30.16b, #10 + ext v16.16b, v16.16b, v31.16b, #10 + b 88f +44: // 4 pixels valid + ext v0.16b, v0.16b, v0.16b, #8 + ext v16.16b, v16.16b, v16.16b, #8 + ext v0.16b, v0.16b, v30.16b, #8 + ext v16.16b, v16.16b, v31.16b, #8 + b 88f +55: // 5 pixels valid + ext v0.16b, v0.16b, v0.16b, #10 + ext v16.16b, v16.16b, v16.16b, #10 + ext v0.16b, v0.16b, v30.16b, #6 + ext v16.16b, v16.16b, v31.16b, #6 + b 88f +66: // 6 pixels valid + ext v0.16b, v0.16b, v0.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v0.16b, v0.16b, v30.16b, #4 + ext v16.16b, v16.16b, v31.16b, #4 + b 88f +77: // 7 pixels valid + ext v0.16b, v0.16b, v0.16b, #14 + ext v16.16b, v16.16b, v16.16b, #14 + ext v0.16b, v0.16b, v30.16b, #2 + ext v16.16b, v16.16b, v31.16b, #2 + b 88f + +L(box5_variable_shift_tbl): + .hword L(box5_variable_shift_tbl) - 22b + .hword L(box5_variable_shift_tbl) - 33b + .hword L(box5_variable_shift_tbl) - 44b + .hword L(box5_variable_shift_tbl) - 55b + .hword L(box5_variable_shift_tbl) - 66b + .hword L(box5_variable_shift_tbl) - 77b + +88: + umull v2.4s, v0.4h, v0.4h + umull2 v3.4s, v0.8h, v0.8h + umull v4.4s, v1.4h, v1.4h + umull v18.4s, v16.4h, v16.4h + umull2 v19.4s, v16.8h, v16.8h + umull v20.4s, v17.4h, v17.4h + + add5 4, .4h + subs w5, w5, #4 + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + b.le 9f + ext v0.16b, v0.16b, v1.16b, #8 + ext v16.16b, v16.16b, v17.16b, #8 + mov v2.16b, v3.16b + mov v3.16b, v4.16b + mov v18.16b, v19.16b + mov v19.16b, v20.16b + add5 4, .4h + st1 {v6.4h}, [x1], #8 + st1 {v7.4h}, [x11], #8 + st1 {v22.4s}, [x0], #16 + st1 {v24.4s}, [x10], #16 + +9: + subs w6, w6, #2 + b.le 0f + // Jump to the next row and loop horizontally + add x0, x0, x9, lsl #1 + add x10, x10, x9, lsl #1 + add x1, x1, x9 + add x11, x11, x9 + add x3, x3, x4 + add x12, x12, x4 + mov w5, w8 + b 1b +0: + ret +.purgem add5 +endfunc + +sgr_funcs 16 diff --git a/src/arm/64/looprestoration_common.S b/src/arm/64/looprestoration_common.S index dc07827..200eb63 100644 --- a/src/arm/64/looprestoration_common.S +++ b/src/arm/64/looprestoration_common.S @@ -328,10 +328,13 @@ function sgr_box5_v_neon, export=1 endfunc // void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength); +// const int w, const int h, const int strength, +// const int bitdepth_max); // void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength); +// const int w, const int h, const int strength, +// const int bitdepth_max); function sgr_calc_ab1_neon, export=1 + clz w9, w5 add x3, x3, #2 // h += 2 movi v31.4s, #9 // n mov x5, #455 @@ -340,6 +343,7 @@ function sgr_calc_ab1_neon, export=1 endfunc function sgr_calc_ab2_neon, export=1 + clz w9, w5 add x3, x3, #3 // h += 3 asr x3, x3, #1 // h /= 2 movi v31.4s, #25 // n @@ -348,14 +352,17 @@ function sgr_calc_ab2_neon, export=1 endfunc function sgr_calc_ab_neon + sub w9, w9, #24 // -bitdepth_min_8 movrel x12, X(sgr_x_by_x) ld1 {v16.16b, v17.16b, v18.16b}, [x12] + dup v6.8h, w9 // -bitdepth_min_8 movi v19.16b, #5 movi v20.8b, #55 // idx of last 5 movi v21.8b, #72 // idx of last 4 movi v22.8b, #101 // idx of last 3 movi v23.8b, #169 // idx of last 2 movi v24.8b, #254 // idx of last 1 + saddl 
v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8 add x2, x2, #2 // w += 2 add x7, x2, #7 bic x7, x7, #7 // aligned w @@ -373,10 +380,13 @@ function sgr_calc_ab_neon subs x2, x2, #8 ld1 {v0.4s, v1.4s}, [x0] // a ld1 {v2.8h}, [x1] // b + srshl v0.4s, v0.4s, v7.4s + srshl v1.4s, v1.4s, v7.4s + srshl v4.8h, v2.8h, v6.8h mul v0.4s, v0.4s, v31.4s // a * n mul v1.4s, v1.4s, v31.4s // a * n - umull v3.4s, v2.4h, v2.4h // b * b - umull2 v4.4s, v2.8h, v2.8h // b * b + umull v3.4s, v4.4h, v4.4h // b * b + umull2 v4.4s, v4.8h, v4.8h // b * b uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) mul v0.4s, v0.4s, v28.4s // p * s @@ -389,13 +399,13 @@ function sgr_calc_ab_neon cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 - cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 + cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 add v25.8b, v25.8b, v26.8b - cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 - add v27.8b, v27.8b, v5.8b - add v6.8b, v6.8b, v19.8b + cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 + add v27.8b, v27.8b, v4.8b + add v5.8b, v5.8b, v19.8b add v25.8b, v25.8b, v27.8b - add v1.8b, v1.8b, v6.8b + add v1.8b, v1.8b, v5.8b add v1.8b, v1.8b, v25.8b uxtl v1.8h, v1.8b // x diff --git a/src/arm/64/looprestoration_tmpl.S b/src/arm/64/looprestoration_tmpl.S index 27c952d..520365b 100644 --- a/src/arm/64/looprestoration_tmpl.S +++ b/src/arm/64/looprestoration_tmpl.S @@ -29,11 +29,12 @@ #define FILTER_OUT_STRIDE 384 -// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp, +.macro sgr_funcs bpc +// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp, // const pixel *src, const ptrdiff_t stride, // const int32_t *a, const int16_t *b, // const int w, const int h); -function sgr_finish_filter1_8bpc_neon, export=1 +function sgr_finish_filter1_\bpc\()bpc_neon, export=1 sub x7, x3, #(4*SUM_STRIDE) add x8, x3, #(4*SUM_STRIDE) sub x9, x4, #(2*SUM_STRIDE) @@ -42,7 +43,11 @@ function sgr_finish_filter1_8bpc_neon, export=1 mov x12, #FILTER_OUT_STRIDE add x13, x5, #7 bic x13, x13, #7 // Aligned width +.if \bpc == 8 sub x2, x2, x13 +.else + sub x2, x2, x13, lsl #1 +.endif sub x12, x12, x13 sub x11, x11, x13 sub x11, x11, #4 // We read 4 extra elements from a @@ -98,7 +103,11 @@ function sgr_finish_filter1_8bpc_neon, export=1 ext v28.16b, v23.16b, v24.16b, #4 ext v29.16b, v22.16b, v23.16b, #8 // +1+stride ext v30.16b, v23.16b, v24.16b, #8 +.if \bpc == 8 ld1 {v19.8b}, [x1], #8 // src +.else + ld1 {v19.8h}, [x1], #16 // src +.endif add v25.4s, v25.4s, v27.4s // +stride add v26.4s, v26.4s, v28.4s add v16.4s, v16.4s, v29.4s // +1+stride @@ -107,7 +116,9 @@ function sgr_finish_filter1_8bpc_neon, export=1 shl v26.4s, v26.4s, #2 mla v25.4s, v16.4s, v7.4s // * 3 -> b mla v26.4s, v17.4s, v7.4s +.if \bpc == 8 uxtl v19.8h, v19.8b // src +.endif mov v0.16b, v1.16b umlal v25.4s, v2.4h, v19.4h // b + a * src umlal2 v26.4s, v2.8h, v19.8h @@ -146,11 +157,11 @@ function sgr_finish_filter1_8bpc_neon, export=1 ret endfunc -// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp, +// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp, // const pixel *src, const ptrdiff_t stride, // const int32_t *a, const int16_t *b, // const int w, const int h); -function sgr_finish_filter2_8bpc_neon, export=1 +function sgr_finish_filter2_\bpc\()bpc_neon, export=1 add x7, x3, #(4*(SUM_STRIDE)) sub x3, x3, #(4*(SUM_STRIDE)) add x8, x4, #(2*(SUM_STRIDE)) @@ -159,7 
+170,11 @@ function sgr_finish_filter2_8bpc_neon, export=1 mov x10, #FILTER_OUT_STRIDE add x11, x5, #7 bic x11, x11, #7 // Aligned width +.if \bpc == 8 sub x2, x2, x11 +.else + sub x2, x2, x11, lsl #1 +.endif sub x10, x10, x11 sub x9, x9, x11 sub x9, x9, #4 // We read 4 extra elements from a @@ -196,7 +211,11 @@ function sgr_finish_filter2_8bpc_neon, export=1 ext v29.16b, v20.16b, v21.16b, #8 mul v0.8h, v0.8h, v4.8h // * 5 mla v0.8h, v2.8h, v6.8h // * 6 +.if \bpc == 8 ld1 {v31.8b}, [x1], #8 +.else + ld1 {v31.8h}, [x1], #16 +.endif add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride add v17.4s, v17.4s, v27.4s add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride @@ -213,7 +232,9 @@ function sgr_finish_filter2_8bpc_neon, export=1 mul v17.4s, v17.4s, v5.4s // * 5 mla v17.4s, v23.4s, v7.4s // * 6 +.if \bpc == 8 uxtl v31.8h, v31.8b +.endif umlal v16.4s, v0.4h, v31.4h // b + a * src umlal2 v17.4s, v0.8h, v31.8h mov v0.16b, v1.16b @@ -259,10 +280,16 @@ function sgr_finish_filter2_8bpc_neon, export=1 ext v27.16b, v17.16b, v18.16b, #8 mul v2.8h, v22.8h, v6.8h // * 6 mla v2.8h, v0.8h, v4.8h // * 5 -> a +.if \bpc == 8 ld1 {v31.8b}, [x1], #8 +.else + ld1 {v31.8h}, [x1], #16 +.endif add v16.4s, v16.4s, v26.4s // -1, +1 add v17.4s, v17.4s, v27.4s +.if \bpc == 8 uxtl v31.8h, v31.8b +.endif // This is, surprisingly, faster than other variants where the // mul+mla pairs are further apart, on Cortex A53. mul v24.4s, v24.4s, v7.4s // * 6 @@ -296,13 +323,19 @@ function sgr_finish_filter2_8bpc_neon, export=1 ret endfunc -// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride, +// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *t1, const int w, const int h, -// const int wt); -function sgr_weighted1_8bpc_neon, export=1 +// const int wt, const int bitdepth_max); +function sgr_weighted1_\bpc\()bpc_neon, export=1 +.if \bpc == 16 + ldr w8, [sp] +.endif dup v31.8h, w7 cmp x6, #2 +.if \bpc == 16 + dup v30.8h, w8 +.endif add x9, x0, x1 add x10, x2, x3 add x11, x4, #2*FILTER_OUT_STRIDE @@ -311,19 +344,34 @@ function sgr_weighted1_8bpc_neon, export=1 lsl x3, x3, #1 add x8, x5, #7 bic x8, x8, #7 // Aligned width +.if \bpc == 8 sub x1, x1, x8 sub x3, x3, x8 +.else + sub x1, x1, x8, lsl #1 + sub x3, x3, x8, lsl #1 +.endif sub x7, x7, x8, lsl #1 mov x8, x5 b.lt 2f 1: +.if \bpc == 8 ld1 {v0.8b}, [x2], #8 ld1 {v4.8b}, [x10], #8 +.else + ld1 {v0.8h}, [x2], #16 + ld1 {v4.8h}, [x10], #16 +.endif ld1 {v1.8h}, [x4], #16 ld1 {v5.8h}, [x11], #16 subs x5, x5, #8 +.if \bpc == 8 ushll v0.8h, v0.8b, #4 // u ushll v4.8h, v4.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u + shl v4.8h, v4.8h, #4 // u +.endif sub v1.8h, v1.8h, v0.8h // t1 - u sub v5.8h, v5.8h, v4.8h // t1 - u ushll v2.4s, v0.4h, #7 // u << 7 @@ -334,6 +382,7 @@ function sgr_weighted1_8bpc_neon, export=1 smlal2 v3.4s, v1.8h, v31.8h // v smlal v6.4s, v5.4h, v31.4h // v smlal2 v7.4s, v5.8h, v31.8h // v +.if \bpc == 8 rshrn v2.4h, v2.4s, #11 rshrn2 v2.8h, v3.4s, #11 rshrn v6.4h, v6.4s, #11 @@ -342,6 +391,16 @@ function sgr_weighted1_8bpc_neon, export=1 sqxtun v6.8b, v6.8h st1 {v2.8b}, [x0], #8 st1 {v6.8b}, [x9], #8 +.else + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + sqrshrun v6.4h, v6.4s, #11 + sqrshrun2 v6.8h, v7.4s, #11 + umin v2.8h, v2.8h, v30.8h + umin v6.8h, v6.8h, v30.8h + st1 {v2.8h}, [x0], #16 + st1 {v6.8h}, [x9], #16 +.endif b.gt 1b sub x6, x6, #2 @@ -358,57 +417,94 @@ function sgr_weighted1_8bpc_neon, export=1 b 1b 2: +.if \bpc == 8 ld1 
{v0.8b}, [x2], #8 +.else + ld1 {v0.8h}, [x2], #16 +.endif ld1 {v1.8h}, [x4], #16 subs x5, x5, #8 +.if \bpc == 8 ushll v0.8h, v0.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u +.endif sub v1.8h, v1.8h, v0.8h // t1 - u ushll v2.4s, v0.4h, #7 // u << 7 ushll2 v3.4s, v0.8h, #7 // u << 7 smlal v2.4s, v1.4h, v31.4h // v smlal2 v3.4s, v1.8h, v31.8h // v +.if \bpc == 8 rshrn v2.4h, v2.4s, #11 rshrn2 v2.8h, v3.4s, #11 sqxtun v2.8b, v2.8h st1 {v2.8b}, [x0], #8 +.else + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + umin v2.8h, v2.8h, v30.8h + st1 {v2.8h}, [x0], #16 +.endif b.gt 2b 0: ret endfunc -// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *t1, const int16_t *t2, // const int w, const int h, // const int16_t wt[2]); -function sgr_weighted2_8bpc_neon, export=1 +function sgr_weighted2_\bpc\()bpc_neon, export=1 +.if \bpc == 8 ldr x8, [sp] +.else + ldp x8, x9, [sp] +.endif cmp x7, #2 add x10, x0, x1 add x11, x2, x3 add x12, x4, #2*FILTER_OUT_STRIDE add x13, x5, #2*FILTER_OUT_STRIDE ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1] +.if \bpc == 16 + dup v29.8h, w9 +.endif mov x8, #4*FILTER_OUT_STRIDE lsl x1, x1, #1 lsl x3, x3, #1 add x9, x6, #7 bic x9, x9, #7 // Aligned width +.if \bpc == 8 sub x1, x1, x9 sub x3, x3, x9 +.else + sub x1, x1, x9, lsl #1 + sub x3, x3, x9, lsl #1 +.endif sub x8, x8, x9, lsl #1 mov x9, x6 b.lt 2f 1: +.if \bpc == 8 ld1 {v0.8b}, [x2], #8 ld1 {v16.8b}, [x11], #8 +.else + ld1 {v0.8h}, [x2], #16 + ld1 {v16.8h}, [x11], #16 +.endif ld1 {v1.8h}, [x4], #16 ld1 {v17.8h}, [x12], #16 ld1 {v2.8h}, [x5], #16 ld1 {v18.8h}, [x13], #16 subs x6, x6, #8 +.if \bpc == 8 ushll v0.8h, v0.8b, #4 // u ushll v16.8h, v16.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u + shl v16.8h, v16.8h, #4 // u +.endif sub v1.8h, v1.8h, v0.8h // t1 - u sub v2.8h, v2.8h, v0.8h // t2 - u sub v17.8h, v17.8h, v16.8h // t1 - u @@ -425,6 +521,7 @@ function sgr_weighted2_8bpc_neon, export=1 smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u) smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u) smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u) +.if \bpc == 8 rshrn v3.4h, v3.4s, #11 rshrn2 v3.8h, v4.4s, #11 rshrn v19.4h, v19.4s, #11 @@ -433,6 +530,16 @@ function sgr_weighted2_8bpc_neon, export=1 sqxtun v19.8b, v19.8h st1 {v3.8b}, [x0], #8 st1 {v19.8b}, [x10], #8 +.else + sqrshrun v3.4h, v3.4s, #11 + sqrshrun2 v3.8h, v4.4s, #11 + sqrshrun v19.4h, v19.4s, #11 + sqrshrun2 v19.8h, v20.4s, #11 + umin v3.8h, v3.8h, v29.8h + umin v19.8h, v19.8h, v29.8h + st1 {v3.8h}, [x0], #16 + st1 {v19.8h}, [x10], #16 +.endif b.gt 1b subs x7, x7, #2 @@ -451,11 +558,19 @@ function sgr_weighted2_8bpc_neon, export=1 b 1b 2: +.if \bpc == 8 ld1 {v0.8b}, [x2], #8 +.else + ld1 {v0.8h}, [x2], #16 +.endif ld1 {v1.8h}, [x4], #16 ld1 {v2.8h}, [x5], #16 subs x6, x6, #8 +.if \bpc == 8 ushll v0.8h, v0.8b, #4 // u +.else + shl v0.8h, v0.8h, #4 // u +.endif sub v1.8h, v1.8h, v0.8h // t1 - u sub v2.8h, v2.8h, v0.8h // t2 - u ushll v3.4s, v0.4h, #7 // u << 7 @@ -464,11 +579,19 @@ function sgr_weighted2_8bpc_neon, export=1 smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) +.if \bpc == 8 rshrn v3.4h, v3.4s, #11 rshrn2 v3.8h, v4.4s, #11 sqxtun v3.8b, v3.8h st1 {v3.8b}, [x0], #8 +.else + sqrshrun v3.4h, v3.4s, #11 + sqrshrun2 v3.8h, v4.4s, #11 + umin v3.8h, v3.8h, v29.8h + st1 {v3.8h}, [x0], #16 +.endif b.gt 
1b 0: ret endfunc +.endm diff --git a/src/arm/looprestoration_init_tmpl.c b/src/arm/looprestoration_init_tmpl.c index 17bac48..1f18d62 100644 --- a/src/arm/looprestoration_init_tmpl.c +++ b/src/arm/looprestoration_init_tmpl.c @@ -104,9 +104,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride, BF(dav1d_copy_narrow, neon)(dst + (w & ~7), dst_stride, tmp, w & 7, h); } } -#endif -#if BITDEPTH == 8 void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum, const pixel (*left)[4], const pixel *src, const ptrdiff_t stride, @@ -116,7 +114,8 @@ void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, const int w, const int h, const enum LrEdgeFlags edges); void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, - const int w, const int h, const int strength); + const int w, const int h, const int strength, + const int bitdepth_max); void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp, const pixel *src, const ptrdiff_t stride, const int32_t *a, const int16_t *b, @@ -147,7 +146,7 @@ static void dav1d_sgr_filter1_neon(int16_t *tmp, lpf_stride, w, 2, edges); dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges); - dav1d_sgr_calc_ab1_neon(a, b, w, h, strength); + dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX); BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h); } @@ -160,7 +159,8 @@ void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, const int w, const int h, const enum LrEdgeFlags edges); void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, - const int w, const int h, const int strength); + const int w, const int h, const int strength, + const int bitdepth_max); void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp, const pixel *src, const ptrdiff_t stride, const int32_t *a, const int16_t *b, @@ -191,7 +191,7 @@ static void dav1d_sgr_filter2_neon(int16_t *tmp, lpf_stride, w, 2, edges); dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges); - dav1d_sgr_calc_ab2_neon(a, b, w, h, strength); + dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX); BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h); } @@ -292,8 +292,7 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont #if BITDEPTH == 8 || ARCH_AARCH64 c->wiener = wiener_filter_neon; -#endif -#if BITDEPTH == 8 - c->selfguided = sgr_filter_neon; + if (bpc <= 10) + c->selfguided = sgr_filter_neon; #endif } -- cgit v1.2.3
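
A note on the new bitdepth_max parameter (illustrative, not part of the patch): the init code now passes BITDEPTH_MAX so the same sgr_calc_abX_neon entry points serve all modes, and the assembly derives -(bitdepth - 8) from it with the new "clz w9, w5" / "sub w9, w9, #24" sequence, then uses srshl (a rounding shift by that negative amount) to scale the box sums before the a * n - b * b computation. A minimal C sketch of that scaling follows, under the assumption that it mirrors the assembly; the helper names (bitdepth_min_8_from_max, sgr_p) are made up for illustration and do not exist in dav1d.

/* Illustrative sketch only -- not part of the patch, and not dav1d's actual
 * C code. It mirrors what the new instructions in sgr_calc_ab_neon compute:
 *   clz  w9, w5        // w5 = bitdepth_max (0xff for 8 bpc, 0x3ff for 10 bpc)
 *   sub  w9, w9, #24   // = -(bitdepth - 8)
 * followed by srshl on a and b, which normalizes the box sums to the 8 bpc
 * value range before p = imax(a * n - b * b, 0), so one strength/table
 * lookup path serves all bitdepths. Helper names below are hypothetical. */
#include <stdint.h>

static inline int bitdepth_min_8_from_max(int bitdepth_max)
{
    int bits = 0;                      /* 8 for 0xff, 10 for 0x3ff */
    while (bitdepth_max >> bits)
        bits++;
    return bits - 8;                   /* the clz/sub pair yields the negation */
}

static inline unsigned sgr_p(int32_t a,  /* box sum of pixel^2 */
                             int32_t b,  /* box sum of pixel   */
                             int n,      /* 9 (3x3) or 25 (5x5) */
                             int bitdepth_max)
{
    const int s = bitdepth_min_8_from_max(bitdepth_max);
    /* Rounding right shifts, as performed by srshl with a negative shift. */
    const int32_t as = (a + ((1 << (2 * s)) >> 1)) >> (2 * s);
    const int32_t bs = (b + ((1 << s) >> 1)) >> s;
    const int32_t p  = as * n - bs * bs;
    return p > 0 ? (unsigned)p : 0;    /* the asm clamps at 0 via uqsub */
}

In 8 bpc mode s is 0, so the shifts are no-ops and only the few extra scaling instructions remain, which matches the roughly 1% overall SGR slowdown quoted in the commit message; the 10-vs-12 bpc limitation is separate and comes from keeping the sum/tmp buffers as int16_t.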