github.com/videolan/dav1d.git
author     Martin Storsjö <martin@martin.st>   2020-02-10 00:39:11 +0300
committer  Martin Storsjö <martin@martin.st>   2020-02-11 11:43:43 +0300
commit     7cf5d7535f44d7c2d00e368575d0d26b66c73121 (patch)
tree       a1ed49b4da47a0382402e6c141b57e3e65437276 /src/arm
parent     32e265a86e535b5fad47bcac9b54f83e1e5eab33 (diff)
arm64: looprestoration: Prepare for 16 bpc by splitting code to separate files
looprestoration_common.S contains functions that can be used as-is, with one single instantiation of the functions for both 8 and 16 bpc. This file will be built once, regardless of which bitdepths are enabled.

looprestoration_tmpl.S contains functions where the source can be shared and templated between 8 and 16 bpc. It will be included by the separate 8/16 bpc implementation files.
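For orientation, here is a minimal sketch of how the pieces fit together after this split. The comment summary and the include pattern mirror what the patch itself does; the future 16 bpc implementation file mentioned at the end is an assumption about follow-up work, not something added by this commit.

// looprestoration_common.S: sgr_box3_v_neon, sgr_box5_v_neon and the
// sgr_calc_ab*_neon helpers; bitdepth-agnostic, assembled exactly once
// regardless of which bitdepths are enabled.
//
// looprestoration_tmpl.S: the sgr_finish_filter* and sgr_weighted*
// sources (still carrying their 8bpc names in this commit); not
// assembled on its own, but pulled in by each bitdepth-specific file,
// which must define SUM_STRIDE first:

#define SUM_STRIDE (384+16)           // defined by the including file
#include "looprestoration_tmpl.S"     // as done in looprestoration.S below

// A 16 bpc implementation file would later repeat the same include with
// its own definitions (assumption; no such file is part of this patch).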
Diffstat (limited to 'src/arm')
-rw-r--r--  src/arm/64/looprestoration.S        | 839
-rw-r--r--  src/arm/64/looprestoration_common.S | 422
-rw-r--r--  src/arm/64/looprestoration_tmpl.S   | 474
3 files changed, 898 insertions, 837 deletions
diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S
index 8a6bfab..b6f0934 100644
--- a/src/arm/64/looprestoration.S
+++ b/src/arm/64/looprestoration.S
@@ -617,6 +617,8 @@ endfunc
#define SUM_STRIDE (384+16)
+#include "looprestoration_tmpl.S"
+
// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
// const pixel (*left)[4],
// const pixel *src, const ptrdiff_t stride,
@@ -1146,840 +1148,3 @@ L(box5_variable_shift_tbl):
ret
.purgem add5
endfunc
-
-// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
-// const int w, const int h,
-// const enum LrEdgeFlags edges);
-function sgr_box3_v_neon, export=1
- add w10, w3, #2 // Number of output rows to move back
- mov w11, w3 // Number of input rows to move back
- add w2, w2, #2 // Actual summed width
- mov x7, #(4*SUM_STRIDE) // sumsq stride
- mov x8, #(2*SUM_STRIDE) // sum stride
- sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
- sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
-
- tst w4, #4 // LR_HAVE_TOP
- b.eq 0f
- // If have top, read from row -2.
- sub x5, x0, #(4*SUM_STRIDE)
- sub x6, x1, #(2*SUM_STRIDE)
- add w11, w11, #2
- b 1f
-0:
- // !LR_HAVE_TOP
- // If we don't have top, read from row 0 even if
- // we start writing to row -1.
- add x5, x0, #(4*SUM_STRIDE)
- add x6, x1, #(2*SUM_STRIDE)
-1:
-
- tst w4, #8 // LR_HAVE_BOTTOM
- b.eq 1f
- // LR_HAVE_BOTTOM
- add w3, w3, #2 // Sum all h+2 lines with the main loop
- add w11, w11, #2
-1:
- mov w9, w3 // Backup of h for next loops
-
-1:
- // Start of horizontal loop; start one vertical filter slice.
- // Start loading rows into v16-v21 and v24-v26 taking top
- // padding into consideration.
- tst w4, #4 // LR_HAVE_TOP
- ld1 {v16.4s, v17.4s}, [x5], x7
- ld1 {v24.8h}, [x6], x8
- b.eq 2f
- // LR_HAVE_TOP
- ld1 {v18.4s, v19.4s}, [x5], x7
- ld1 {v25.8h}, [x6], x8
- ld1 {v20.4s, v21.4s}, [x5], x7
- ld1 {v26.8h}, [x6], x8
- b 3f
-2: // !LR_HAVE_TOP
- mov v18.16b, v16.16b
- mov v19.16b, v17.16b
- mov v25.16b, v24.16b
- mov v20.16b, v16.16b
- mov v21.16b, v17.16b
- mov v26.16b, v24.16b
-
-3:
- subs w3, w3, #1
-.macro add3
- add v16.4s, v16.4s, v18.4s
- add v17.4s, v17.4s, v19.4s
- add v24.8h, v24.8h, v25.8h
- add v16.4s, v16.4s, v20.4s
- add v17.4s, v17.4s, v21.4s
- add v24.8h, v24.8h, v26.8h
- st1 {v16.4s, v17.4s}, [x0], x7
- st1 {v24.8h}, [x1], x8
-.endm
- add3
- mov v16.16b, v18.16b
- mov v17.16b, v19.16b
- mov v24.16b, v25.16b
- mov v18.16b, v20.16b
- mov v19.16b, v21.16b
- mov v25.16b, v26.16b
- b.le 4f
- ld1 {v20.4s, v21.4s}, [x5], x7
- ld1 {v26.8h}, [x6], x8
- b 3b
-
-4:
- tst w4, #8 // LR_HAVE_BOTTOM
- b.ne 5f
- // !LR_HAVE_BOTTOM
- // Produce two more rows, extending the already loaded rows.
- add3
- mov v16.16b, v18.16b
- mov v17.16b, v19.16b
- mov v24.16b, v25.16b
- add3
-
-5: // End of one vertical slice.
- subs w2, w2, #8
- b.le 0f
- // Move pointers back up to the top and loop horizontally.
- // Input pointers
- msub x5, x7, x11, x5
- msub x6, x8, x11, x6
- // Output pointers
- msub x0, x7, x10, x0
- msub x1, x8, x10, x1
- add x0, x0, #32
- add x1, x1, #16
- add x5, x5, #32
- add x6, x6, #16
- mov w3, w9
- b 1b
-
-0:
- ret
-.purgem add3
-endfunc
-
-// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
-// const int w, const int h,
-// const enum LrEdgeFlags edges);
-function sgr_box5_v_neon, export=1
- add w10, w3, #2 // Number of output rows to move back
- mov w11, w3 // Number of input rows to move back
- add w2, w2, #8 // Actual summed width
- mov x7, #(4*SUM_STRIDE) // sumsq stride
- mov x8, #(2*SUM_STRIDE) // sum stride
- sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
- sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
-
- tst w4, #4 // LR_HAVE_TOP
- b.eq 0f
- // If have top, read from row -2.
- sub x5, x0, #(4*SUM_STRIDE)
- sub x6, x1, #(2*SUM_STRIDE)
- add w11, w11, #2
- b 1f
-0:
- // !LR_HAVE_TOP
- // If we don't have top, read from row 0 even if
- // we start writing to row -1.
- add x5, x0, #(4*SUM_STRIDE)
- add x6, x1, #(2*SUM_STRIDE)
-1:
-
- tst w4, #8 // LR_HAVE_BOTTOM
- b.eq 0f
- // LR_HAVE_BOTTOM
- add w3, w3, #2 // Handle h+2 lines with the main loop
- add w11, w11, #2
- b 1f
-0:
- // !LR_HAVE_BOTTOM
- sub w3, w3, #1 // Handle h-1 lines with the main loop
-1:
- mov w9, w3 // Backup of h for next loops
-
-1:
- // Start of horizontal loop; start one vertical filter slice.
- // Start loading rows into v16-v25 and v26-v30 taking top
- // padding into consideration.
- tst w4, #4 // LR_HAVE_TOP
- ld1 {v16.4s, v17.4s}, [x5], x7
- ld1 {v26.8h}, [x6], x8
- b.eq 2f
- // LR_HAVE_TOP
- ld1 {v20.4s, v21.4s}, [x5], x7
- ld1 {v28.8h}, [x6], x8
- mov v18.16b, v16.16b
- mov v19.16b, v17.16b
- mov v27.16b, v26.16b
- ld1 {v22.4s, v23.4s}, [x5], x7
- ld1 {v29.8h}, [x6], x8
- b 3f
-2: // !LR_HAVE_TOP
- mov v18.16b, v16.16b
- mov v19.16b, v17.16b
- mov v27.16b, v26.16b
- mov v20.16b, v16.16b
- mov v21.16b, v17.16b
- mov v28.16b, v26.16b
- mov v22.16b, v16.16b
- mov v23.16b, v17.16b
- mov v29.16b, v26.16b
-
-3:
- cbz w3, 4f
- ld1 {v24.4s, v25.4s}, [x5], x7
- ld1 {v30.8h}, [x6], x8
-
-3:
- // Start of vertical loop
- subs w3, w3, #2
-.macro add5
- add v16.4s, v16.4s, v18.4s
- add v17.4s, v17.4s, v19.4s
- add v26.8h, v26.8h, v27.8h
- add v0.4s, v20.4s, v22.4s
- add v1.4s, v21.4s, v23.4s
- add v2.8h, v28.8h, v29.8h
- add v16.4s, v16.4s, v24.4s
- add v17.4s, v17.4s, v25.4s
- add v26.8h, v26.8h, v30.8h
- add v16.4s, v16.4s, v0.4s
- add v17.4s, v17.4s, v1.4s
- add v26.8h, v26.8h, v2.8h
- st1 {v16.4s, v17.4s}, [x0], x7
- st1 {v26.8h}, [x1], x8
-.endm
- add5
-.macro shift2
- mov v16.16b, v20.16b
- mov v17.16b, v21.16b
- mov v26.16b, v28.16b
- mov v18.16b, v22.16b
- mov v19.16b, v23.16b
- mov v27.16b, v29.16b
- mov v20.16b, v24.16b
- mov v21.16b, v25.16b
- mov v28.16b, v30.16b
-.endm
- shift2
- add x0, x0, x7
- add x1, x1, x8
- b.le 5f
- ld1 {v22.4s, v23.4s}, [x5], x7
- ld1 {v29.8h}, [x6], x8
- ld1 {v24.4s, v25.4s}, [x5], x7
- ld1 {v30.8h}, [x6], x8
- b 3b
-
-4:
- // h == 1, !LR_HAVE_BOTTOM.
- // Pad the last row with the only content row, and add.
- mov v24.16b, v22.16b
- mov v25.16b, v23.16b
- mov v30.16b, v29.16b
- add5
- shift2
- add x0, x0, x7
- add x1, x1, x8
- add5
- b 6f
-
-5:
- tst w4, #8 // LR_HAVE_BOTTOM
- b.ne 6f
- // !LR_HAVE_BOTTOM
- cbnz w3, 5f
- // The intended three edge rows left; output the one at h-2 and
- // the past edge one at h.
- ld1 {v22.4s, v23.4s}, [x5], x7
- ld1 {v29.8h}, [x6], x8
- // Pad the past-edge row from the last content row.
- mov v24.16b, v22.16b
- mov v25.16b, v23.16b
- mov v30.16b, v29.16b
- add5
- shift2
- add x0, x0, x7
- add x1, x1, x8
- // The last two rows are already padded properly here.
- add5
- b 6f
-
-5:
- // w3 == -1, two rows left, output one.
- // Pad the last two rows from the mid one.
- mov v22.16b, v20.16b
- mov v23.16b, v21.16b
- mov v29.16b, v28.16b
- mov v24.16b, v20.16b
- mov v25.16b, v21.16b
- mov v30.16b, v28.16b
- add5
- add x0, x0, x7
- add x1, x1, x8
- b 6f
-
-6: // End of one vertical slice.
- subs w2, w2, #8
- b.le 0f
- // Move pointers back up to the top and loop horizontally.
- // Input pointers
- msub x5, x7, x11, x5
- msub x6, x8, x11, x6
- // Output pointers
- msub x0, x7, x10, x0
- msub x1, x8, x10, x1
- add x0, x0, #32
- add x1, x1, #16
- add x5, x5, #32
- add x6, x6, #16
- mov w3, w9
- b 1b
-
-0:
- ret
-.purgem add5
-endfunc
-
-// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
-// const int w, const int h, const int strength);
-// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
-// const int w, const int h, const int strength);
-function sgr_calc_ab1_neon, export=1
- add x3, x3, #2 // h += 2
- movi v31.4s, #9 // n
- mov x5, #455
- mov x8, #SUM_STRIDE
- b sgr_calc_ab_neon
-endfunc
-
-function sgr_calc_ab2_neon, export=1
- add x3, x3, #3 // h += 3
- asr x3, x3, #1 // h /= 2
- movi v31.4s, #25 // n
- mov x5, #164
- mov x8, #(2*SUM_STRIDE)
-endfunc
-
-function sgr_calc_ab_neon
- movrel x12, X(sgr_x_by_x)
- ld1 {v16.16b, v17.16b, v18.16b}, [x12]
- movi v19.16b, #5
- movi v20.8b, #55 // idx of last 5
- movi v21.8b, #72 // idx of last 4
- movi v22.8b, #101 // idx of last 3
- movi v23.8b, #169 // idx of last 2
- movi v24.8b, #254 // idx of last 1
- add x2, x2, #2 // w += 2
- add x7, x2, #7
- bic x7, x7, #7 // aligned w
- sub x7, x8, x7 // increment between rows
- movi v29.8h, #1, lsl #8
- dup v28.4s, w4
- dup v30.4s, w5 // one_by_x
- sub x0, x0, #(4*(SUM_STRIDE))
- sub x1, x1, #(2*(SUM_STRIDE))
- mov x6, x2 // backup of w
- sub v16.16b, v16.16b, v19.16b
- sub v17.16b, v17.16b, v19.16b
- sub v18.16b, v18.16b, v19.16b
-1:
- subs x2, x2, #8
- ld1 {v0.4s, v1.4s}, [x0] // a
- ld1 {v2.8h}, [x1] // b
- mul v0.4s, v0.4s, v31.4s // a * n
- mul v1.4s, v1.4s, v31.4s // a * n
- umull v3.4s, v2.4h, v2.4h // b * b
- umull2 v4.4s, v2.8h, v2.8h // b * b
- uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
- uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
- mul v0.4s, v0.4s, v28.4s // p * s
- mul v1.4s, v1.4s, v28.4s // p * s
- uqshrn v0.4h, v0.4s, #16
- uqshrn2 v0.8h, v1.4s, #16
- uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
-
- cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
- cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
- tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
- cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
- cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
- add v25.8b, v25.8b, v26.8b
- cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
- add v27.8b, v27.8b, v5.8b
- add v6.8b, v6.8b, v19.8b
- add v25.8b, v25.8b, v27.8b
- add v1.8b, v1.8b, v6.8b
- add v1.8b, v1.8b, v25.8b
- uxtl v1.8h, v1.8b // x
-
- umull v3.4s, v1.4h, v2.4h // x * BB[i]
- umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
- mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
- mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
- srshr v3.4s, v3.4s, #12 // AA[i]
- srshr v4.4s, v4.4s, #12 // AA[i]
- sub v2.8h, v29.8h, v1.8h // 256 - x
-
- st1 {v3.4s, v4.4s}, [x0], #32
- st1 {v2.8h}, [x1], #16
- b.gt 1b
-
- subs x3, x3, #1
- b.le 0f
- add x0, x0, x7, lsl #2
- add x1, x1, x7, lsl #1
- mov x2, x6
- b 1b
-0:
- ret
-endfunc
-
-#define FILTER_OUT_STRIDE 384
-
-// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
-// const pixel *src, const ptrdiff_t stride,
-// const int32_t *a, const int16_t *b,
-// const int w, const int h);
-function sgr_finish_filter1_8bpc_neon, export=1
- sub x7, x3, #(4*SUM_STRIDE)
- add x8, x3, #(4*SUM_STRIDE)
- sub x9, x4, #(2*SUM_STRIDE)
- add x10, x4, #(2*SUM_STRIDE)
- mov x11, #SUM_STRIDE
- mov x12, #FILTER_OUT_STRIDE
- add x13, x5, #7
- bic x13, x13, #7 // Aligned width
- sub x2, x2, x13
- sub x12, x12, x13
- sub x11, x11, x13
- sub x11, x11, #4 // We read 4 extra elements from a
- sub x14, x11, #4 // We read 8 extra elements from b
- mov x13, x5
- movi v6.8h, #3
- movi v7.4s, #3
-1:
- ld1 {v0.8h, v1.8h}, [x9], #32
- ld1 {v2.8h, v3.8h}, [x4], #32
- ld1 {v4.8h, v5.8h}, [x10], #32
- ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
- ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
- ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
-
-2:
- subs x5, x5, #8
- ext v25.16b, v0.16b, v1.16b, #2 // -stride
- ext v26.16b, v2.16b, v3.16b, #2 // 0
- ext v27.16b, v4.16b, v5.16b, #2 // +stride
- ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
- ext v29.16b, v2.16b, v3.16b, #4 // +1
- ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
- add v2.8h, v2.8h, v25.8h // -1, -stride
- add v26.8h, v26.8h, v27.8h // 0, +stride
- add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
- add v2.8h, v2.8h, v26.8h
- add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
- add v2.8h, v2.8h, v29.8h // +1
- add v0.8h, v0.8h, v4.8h
-
- ext v25.16b, v16.16b, v17.16b, #4 // -stride
- ext v26.16b, v17.16b, v18.16b, #4
- shl v2.8h, v2.8h, #2
- ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
- ext v28.16b, v17.16b, v18.16b, #8
- ext v29.16b, v19.16b, v20.16b, #4 // 0
- ext v30.16b, v20.16b, v21.16b, #4
- mla v2.8h, v0.8h, v6.8h // * 3 -> a
- add v25.4s, v25.4s, v19.4s // -stride, -1
- add v26.4s, v26.4s, v20.4s
- add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
- add v17.4s, v17.4s, v28.4s
- ext v27.16b, v19.16b, v20.16b, #8 // +1
- ext v28.16b, v20.16b, v21.16b, #8
- add v16.4s, v16.4s, v22.4s // -1+stride
- add v17.4s, v17.4s, v23.4s
- add v29.4s, v29.4s, v27.4s // 0, +1
- add v30.4s, v30.4s, v28.4s
- add v25.4s, v25.4s, v29.4s
- add v26.4s, v26.4s, v30.4s
- ext v27.16b, v22.16b, v23.16b, #4 // +stride
- ext v28.16b, v23.16b, v24.16b, #4
- ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
- ext v30.16b, v23.16b, v24.16b, #8
- ld1 {v19.8b}, [x1], #8 // src
- add v25.4s, v25.4s, v27.4s // +stride
- add v26.4s, v26.4s, v28.4s
- add v16.4s, v16.4s, v29.4s // +1+stride
- add v17.4s, v17.4s, v30.4s
- shl v25.4s, v25.4s, #2
- shl v26.4s, v26.4s, #2
- mla v25.4s, v16.4s, v7.4s // * 3 -> b
- mla v26.4s, v17.4s, v7.4s
- uxtl v19.8h, v19.8b // src
- mov v0.16b, v1.16b
- umlal v25.4s, v2.4h, v19.4h // b + a * src
- umlal2 v26.4s, v2.8h, v19.8h
- mov v2.16b, v3.16b
- rshrn v25.4h, v25.4s, #9
- rshrn2 v25.8h, v26.4s, #9
- mov v4.16b, v5.16b
- st1 {v25.8h}, [x0], #16
-
- b.le 3f
- mov v16.16b, v18.16b
- mov v19.16b, v21.16b
- mov v22.16b, v24.16b
- ld1 {v1.8h}, [x9], #16
- ld1 {v3.8h}, [x4], #16
- ld1 {v5.8h}, [x10], #16
- ld1 {v17.4s, v18.4s}, [x7], #32
- ld1 {v20.4s, v21.4s}, [x3], #32
- ld1 {v23.4s, v24.4s}, [x8], #32
- b 2b
-
-3:
- subs x6, x6, #1
- b.le 0f
- mov x5, x13
- add x0, x0, x12, lsl #1
- add x1, x1, x2
- add x3, x3, x11, lsl #2
- add x7, x7, x11, lsl #2
- add x8, x8, x11, lsl #2
- add x4, x4, x14, lsl #1
- add x9, x9, x14, lsl #1
- add x10, x10, x14, lsl #1
- b 1b
-0:
- ret
-endfunc
-
-// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
-// const pixel *src, const ptrdiff_t stride,
-// const int32_t *a, const int16_t *b,
-// const int w, const int h);
-function sgr_finish_filter2_8bpc_neon, export=1
- add x7, x3, #(4*(SUM_STRIDE))
- sub x3, x3, #(4*(SUM_STRIDE))
- add x8, x4, #(2*(SUM_STRIDE))
- sub x4, x4, #(2*(SUM_STRIDE))
- mov x9, #(2*SUM_STRIDE)
- mov x10, #FILTER_OUT_STRIDE
- add x11, x5, #7
- bic x11, x11, #7 // Aligned width
- sub x2, x2, x11
- sub x10, x10, x11
- sub x9, x9, x11
- sub x9, x9, #4 // We read 4 extra elements from a
- sub x12, x9, #4 // We read 8 extra elements from b
- mov x11, x5
- movi v4.8h, #5
- movi v5.4s, #5
- movi v6.8h, #6
- movi v7.4s, #6
-1:
- ld1 {v0.8h, v1.8h}, [x4], #32
- ld1 {v2.8h, v3.8h}, [x8], #32
- ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
- ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
-
-2:
- subs x5, x5, #8
- ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
- ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
- ext v22.16b, v0.16b, v1.16b, #2 // -stride
- ext v23.16b, v2.16b, v3.16b, #2 // +stride
- add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
- add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
- add v2.8h, v22.8h, v23.8h // -stride, +stride
- add v0.8h, v0.8h, v25.8h
-
- ext v22.16b, v16.16b, v17.16b, #4 // -stride
- ext v23.16b, v17.16b, v18.16b, #4
- ext v24.16b, v19.16b, v20.16b, #4 // +stride
- ext v25.16b, v20.16b, v21.16b, #4
- ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
- ext v27.16b, v17.16b, v18.16b, #8
- ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
- ext v29.16b, v20.16b, v21.16b, #8
- mul v0.8h, v0.8h, v4.8h // * 5
- mla v0.8h, v2.8h, v6.8h // * 6
- ld1 {v31.8b}, [x1], #8
- add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
- add v17.4s, v17.4s, v27.4s
- add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
- add v20.4s, v20.4s, v29.4s
- add v16.4s, v16.4s, v19.4s
- add v17.4s, v17.4s, v20.4s
-
- add v22.4s, v22.4s, v24.4s // -stride, +stride
- add v23.4s, v23.4s, v25.4s
- // This is, surprisingly, faster than other variants where the
- // mul+mla pairs are further apart, on Cortex A53.
- mul v16.4s, v16.4s, v5.4s // * 5
- mla v16.4s, v22.4s, v7.4s // * 6
- mul v17.4s, v17.4s, v5.4s // * 5
- mla v17.4s, v23.4s, v7.4s // * 6
-
- uxtl v31.8h, v31.8b
- umlal v16.4s, v0.4h, v31.4h // b + a * src
- umlal2 v17.4s, v0.8h, v31.8h
- mov v0.16b, v1.16b
- rshrn v16.4h, v16.4s, #9
- rshrn2 v16.8h, v17.4s, #9
- mov v2.16b, v3.16b
- st1 {v16.8h}, [x0], #16
-
- b.le 3f
- mov v16.16b, v18.16b
- mov v19.16b, v21.16b
- ld1 {v1.8h}, [x4], #16
- ld1 {v3.8h}, [x8], #16
- ld1 {v17.4s, v18.4s}, [x3], #32
- ld1 {v20.4s, v21.4s}, [x7], #32
- b 2b
-
-3:
- subs x6, x6, #1
- b.le 0f
- mov x5, x11
- add x0, x0, x10, lsl #1
- add x1, x1, x2
- add x3, x3, x9, lsl #2
- add x7, x7, x9, lsl #2
- add x4, x4, x12, lsl #1
- add x8, x8, x12, lsl #1
- mov x13, x3
- mov x14, x4
-
- ld1 {v0.8h, v1.8h}, [x4], #32
- ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
-
-4:
- subs x5, x5, #8
- ext v23.16b, v0.16b, v1.16b, #4 // +1
- ext v22.16b, v0.16b, v1.16b, #2 // 0
- add v0.8h, v0.8h, v23.8h // -1, +1
-
- ext v24.16b, v16.16b, v17.16b, #4 // 0
- ext v25.16b, v17.16b, v18.16b, #4
- ext v26.16b, v16.16b, v17.16b, #8 // +1
- ext v27.16b, v17.16b, v18.16b, #8
- mul v2.8h, v22.8h, v6.8h // * 6
- mla v2.8h, v0.8h, v4.8h // * 5 -> a
- ld1 {v31.8b}, [x1], #8
- add v16.4s, v16.4s, v26.4s // -1, +1
- add v17.4s, v17.4s, v27.4s
- uxtl v31.8h, v31.8b
- // This is, surprisingly, faster than other variants where the
- // mul+mla pairs are further apart, on Cortex A53.
- mul v24.4s, v24.4s, v7.4s // * 6
- mla v24.4s, v16.4s, v5.4s // * 5 -> b
- mul v25.4s, v25.4s, v7.4s // * 6
- mla v25.4s, v17.4s, v5.4s // * 5 -> b
-
- umlal v24.4s, v2.4h, v31.4h // b + a * src
- umlal2 v25.4s, v2.8h, v31.8h
- mov v0.16b, v1.16b
- rshrn v24.4h, v24.4s, #8
- rshrn2 v24.8h, v25.4s, #8
- mov v16.16b, v18.16b
- st1 {v24.8h}, [x0], #16
-
- b.le 5f
- ld1 {v1.8h}, [x4], #16
- ld1 {v17.4s, v18.4s}, [x3], #32
- b 4b
-
-5:
- subs x6, x6, #1
- b.le 0f
- mov x5, x11
- add x0, x0, x10, lsl #1
- add x1, x1, x2
- mov x3, x13 // Rewind x3/x4 to where they started
- mov x4, x14
- b 1b
-0:
- ret
-endfunc
-
-// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
-// const pixel *src, const ptrdiff_t src_stride,
-// const int16_t *t1, const int w, const int h,
-// const int wt);
-function sgr_weighted1_8bpc_neon, export=1
- dup v31.8h, w7
- cmp x6, #2
- add x9, x0, x1
- add x10, x2, x3
- add x11, x4, #2*FILTER_OUT_STRIDE
- mov x7, #(4*FILTER_OUT_STRIDE)
- lsl x1, x1, #1
- lsl x3, x3, #1
- add x8, x5, #7
- bic x8, x8, #7 // Aligned width
- sub x1, x1, x8
- sub x3, x3, x8
- sub x7, x7, x8, lsl #1
- mov x8, x5
- b.lt 2f
-1:
- ld1 {v0.8b}, [x2], #8
- ld1 {v4.8b}, [x10], #8
- ld1 {v1.8h}, [x4], #16
- ld1 {v5.8h}, [x11], #16
- subs x5, x5, #8
- ushll v0.8h, v0.8b, #4 // u
- ushll v4.8h, v4.8b, #4 // u
- sub v1.8h, v1.8h, v0.8h // t1 - u
- sub v5.8h, v5.8h, v4.8h // t1 - u
- ushll v2.4s, v0.4h, #7 // u << 7
- ushll2 v3.4s, v0.8h, #7 // u << 7
- ushll v6.4s, v4.4h, #7 // u << 7
- ushll2 v7.4s, v4.8h, #7 // u << 7
- smlal v2.4s, v1.4h, v31.4h // v
- smlal2 v3.4s, v1.8h, v31.8h // v
- smlal v6.4s, v5.4h, v31.4h // v
- smlal2 v7.4s, v5.8h, v31.8h // v
- rshrn v2.4h, v2.4s, #11
- rshrn2 v2.8h, v3.4s, #11
- rshrn v6.4h, v6.4s, #11
- rshrn2 v6.8h, v7.4s, #11
- sqxtun v2.8b, v2.8h
- sqxtun v6.8b, v6.8h
- st1 {v2.8b}, [x0], #8
- st1 {v6.8b}, [x9], #8
- b.gt 1b
-
- sub x6, x6, #2
- cmp x6, #1
- b.lt 0f
- mov x5, x8
- add x0, x0, x1
- add x9, x9, x1
- add x2, x2, x3
- add x10, x10, x3
- add x4, x4, x7
- add x11, x11, x7
- b.eq 2f
- b 1b
-
-2:
- ld1 {v0.8b}, [x2], #8
- ld1 {v1.8h}, [x4], #16
- subs x5, x5, #8
- ushll v0.8h, v0.8b, #4 // u
- sub v1.8h, v1.8h, v0.8h // t1 - u
- ushll v2.4s, v0.4h, #7 // u << 7
- ushll2 v3.4s, v0.8h, #7 // u << 7
- smlal v2.4s, v1.4h, v31.4h // v
- smlal2 v3.4s, v1.8h, v31.8h // v
- rshrn v2.4h, v2.4s, #11
- rshrn2 v2.8h, v3.4s, #11
- sqxtun v2.8b, v2.8h
- st1 {v2.8b}, [x0], #8
- b.gt 2b
-0:
- ret
-endfunc
-
-// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-// const pixel *src, const ptrdiff_t src_stride,
-// const int16_t *t1, const int16_t *t2,
-// const int w, const int h,
-// const int16_t wt[2]);
-function sgr_weighted2_8bpc_neon, export=1
- ldr x8, [sp]
- cmp x7, #2
- add x10, x0, x1
- add x11, x2, x3
- add x12, x4, #2*FILTER_OUT_STRIDE
- add x13, x5, #2*FILTER_OUT_STRIDE
- ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
- mov x8, #4*FILTER_OUT_STRIDE
- lsl x1, x1, #1
- lsl x3, x3, #1
- add x9, x6, #7
- bic x9, x9, #7 // Aligned width
- sub x1, x1, x9
- sub x3, x3, x9
- sub x8, x8, x9, lsl #1
- mov x9, x6
- b.lt 2f
-1:
- ld1 {v0.8b}, [x2], #8
- ld1 {v16.8b}, [x11], #8
- ld1 {v1.8h}, [x4], #16
- ld1 {v17.8h}, [x12], #16
- ld1 {v2.8h}, [x5], #16
- ld1 {v18.8h}, [x13], #16
- subs x6, x6, #8
- ushll v0.8h, v0.8b, #4 // u
- ushll v16.8h, v16.8b, #4 // u
- sub v1.8h, v1.8h, v0.8h // t1 - u
- sub v2.8h, v2.8h, v0.8h // t2 - u
- sub v17.8h, v17.8h, v16.8h // t1 - u
- sub v18.8h, v18.8h, v16.8h // t2 - u
- ushll v3.4s, v0.4h, #7 // u << 7
- ushll2 v4.4s, v0.8h, #7 // u << 7
- ushll v19.4s, v16.4h, #7 // u << 7
- ushll2 v20.4s, v16.8h, #7 // u << 7
- smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
- smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
- smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
- smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
- smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
- smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
- smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
- smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
- rshrn v3.4h, v3.4s, #11
- rshrn2 v3.8h, v4.4s, #11
- rshrn v19.4h, v19.4s, #11
- rshrn2 v19.8h, v20.4s, #11
- sqxtun v3.8b, v3.8h
- sqxtun v19.8b, v19.8h
- st1 {v3.8b}, [x0], #8
- st1 {v19.8b}, [x10], #8
- b.gt 1b
-
- subs x7, x7, #2
- cmp x7, #1
- b.lt 0f
- mov x6, x9
- add x0, x0, x1
- add x10, x10, x1
- add x2, x2, x3
- add x11, x11, x3
- add x4, x4, x8
- add x12, x12, x8
- add x5, x5, x8
- add x13, x13, x8
- b.eq 2f
- b 1b
-
-2:
- ld1 {v0.8b}, [x2], #8
- ld1 {v1.8h}, [x4], #16
- ld1 {v2.8h}, [x5], #16
- subs x6, x6, #8
- ushll v0.8h, v0.8b, #4 // u
- sub v1.8h, v1.8h, v0.8h // t1 - u
- sub v2.8h, v2.8h, v0.8h // t2 - u
- ushll v3.4s, v0.4h, #7 // u << 7
- ushll2 v4.4s, v0.8h, #7 // u << 7
- smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
- smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
- smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
- smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
- rshrn v3.4h, v3.4s, #11
- rshrn2 v3.8h, v4.4s, #11
- sqxtun v3.8b, v3.8h
- st1 {v3.8b}, [x0], #8
- b.gt 1b
-0:
- ret
-endfunc
diff --git a/src/arm/64/looprestoration_common.S b/src/arm/64/looprestoration_common.S
new file mode 100644
index 0000000..dc07827
--- /dev/null
+++ b/src/arm/64/looprestoration_common.S
@@ -0,0 +1,422 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box3_v_neon, export=1
+ add w10, w3, #2 // Number of output rows to move back
+ mov w11, w3 // Number of input rows to move back
+ add w2, w2, #2 // Actual summed width
+ mov x7, #(4*SUM_STRIDE) // sumsq stride
+ mov x8, #(2*SUM_STRIDE) // sum stride
+ sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst w4, #4 // LR_HAVE_TOP
+ b.eq 0f
+ // If have top, read from row -2.
+ sub x5, x0, #(4*SUM_STRIDE)
+ sub x6, x1, #(2*SUM_STRIDE)
+ add w11, w11, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add x5, x0, #(4*SUM_STRIDE)
+ add x6, x1, #(2*SUM_STRIDE)
+1:
+
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.eq 1f
+ // LR_HAVE_BOTTOM
+ add w3, w3, #2 // Sum all h+2 lines with the main loop
+ add w11, w11, #2
+1:
+ mov w9, w3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into v16-v21 and v24-v26 taking top
+ // padding into consideration.
+ tst w4, #4 // LR_HAVE_TOP
+ ld1 {v16.4s, v17.4s}, [x5], x7
+ ld1 {v24.8h}, [x6], x8
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v18.4s, v19.4s}, [x5], x7
+ ld1 {v25.8h}, [x6], x8
+ ld1 {v20.4s, v21.4s}, [x5], x7
+ ld1 {v26.8h}, [x6], x8
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v18.16b, v16.16b
+ mov v19.16b, v17.16b
+ mov v25.16b, v24.16b
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v26.16b, v24.16b
+
+3:
+ subs w3, w3, #1
+.macro add3
+ add v16.4s, v16.4s, v18.4s
+ add v17.4s, v17.4s, v19.4s
+ add v24.8h, v24.8h, v25.8h
+ add v16.4s, v16.4s, v20.4s
+ add v17.4s, v17.4s, v21.4s
+ add v24.8h, v24.8h, v26.8h
+ st1 {v16.4s, v17.4s}, [x0], x7
+ st1 {v24.8h}, [x1], x8
+.endm
+ add3
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v24.16b, v25.16b
+ mov v18.16b, v20.16b
+ mov v19.16b, v21.16b
+ mov v25.16b, v26.16b
+ b.le 4f
+ ld1 {v20.4s, v21.4s}, [x5], x7
+ ld1 {v26.8h}, [x6], x8
+ b 3b
+
+4:
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.ne 5f
+ // !LR_HAVE_BOTTOM
+ // Produce two more rows, extending the already loaded rows.
+ add3
+ mov v16.16b, v18.16b
+ mov v17.16b, v19.16b
+ mov v24.16b, v25.16b
+ add3
+
+5: // End of one vertical slice.
+ subs w2, w2, #8
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ msub x5, x7, x11, x5
+ msub x6, x8, x11, x6
+ // Output pointers
+ msub x0, x7, x10, x0
+ msub x1, x8, x10, x1
+ add x0, x0, #32
+ add x1, x1, #16
+ add x5, x5, #32
+ add x6, x6, #16
+ mov w3, w9
+ b 1b
+
+0:
+ ret
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+// const int w, const int h,
+// const enum LrEdgeFlags edges);
+function sgr_box5_v_neon, export=1
+ add w10, w3, #2 // Number of output rows to move back
+ mov w11, w3 // Number of input rows to move back
+ add w2, w2, #8 // Actual summed width
+ mov x7, #(4*SUM_STRIDE) // sumsq stride
+ mov x8, #(2*SUM_STRIDE) // sum stride
+ sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride
+ sub x1, x1, #(2*SUM_STRIDE) // sum -= stride
+
+ tst w4, #4 // LR_HAVE_TOP
+ b.eq 0f
+ // If have top, read from row -2.
+ sub x5, x0, #(4*SUM_STRIDE)
+ sub x6, x1, #(2*SUM_STRIDE)
+ add w11, w11, #2
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add x5, x0, #(4*SUM_STRIDE)
+ add x6, x1, #(2*SUM_STRIDE)
+1:
+
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.eq 0f
+ // LR_HAVE_BOTTOM
+ add w3, w3, #2 // Handle h+2 lines with the main loop
+ add w11, w11, #2
+ b 1f
+0:
+ // !LR_HAVE_BOTTOM
+ sub w3, w3, #1 // Handle h-1 lines with the main loop
+1:
+ mov w9, w3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into v16-v25 and v26-v30 taking top
+ // padding into consideration.
+ tst w4, #4 // LR_HAVE_TOP
+ ld1 {v16.4s, v17.4s}, [x5], x7
+ ld1 {v26.8h}, [x6], x8
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v20.4s, v21.4s}, [x5], x7
+ ld1 {v28.8h}, [x6], x8
+ mov v18.16b, v16.16b
+ mov v19.16b, v17.16b
+ mov v27.16b, v26.16b
+ ld1 {v22.4s, v23.4s}, [x5], x7
+ ld1 {v29.8h}, [x6], x8
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v18.16b, v16.16b
+ mov v19.16b, v17.16b
+ mov v27.16b, v26.16b
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v28.16b, v26.16b
+ mov v22.16b, v16.16b
+ mov v23.16b, v17.16b
+ mov v29.16b, v26.16b
+
+3:
+ cbz w3, 4f
+ ld1 {v24.4s, v25.4s}, [x5], x7
+ ld1 {v30.8h}, [x6], x8
+
+3:
+ // Start of vertical loop
+ subs w3, w3, #2
+.macro add5
+ add v16.4s, v16.4s, v18.4s
+ add v17.4s, v17.4s, v19.4s
+ add v26.8h, v26.8h, v27.8h
+ add v0.4s, v20.4s, v22.4s
+ add v1.4s, v21.4s, v23.4s
+ add v2.8h, v28.8h, v29.8h
+ add v16.4s, v16.4s, v24.4s
+ add v17.4s, v17.4s, v25.4s
+ add v26.8h, v26.8h, v30.8h
+ add v16.4s, v16.4s, v0.4s
+ add v17.4s, v17.4s, v1.4s
+ add v26.8h, v26.8h, v2.8h
+ st1 {v16.4s, v17.4s}, [x0], x7
+ st1 {v26.8h}, [x1], x8
+.endm
+ add5
+.macro shift2
+ mov v16.16b, v20.16b
+ mov v17.16b, v21.16b
+ mov v26.16b, v28.16b
+ mov v18.16b, v22.16b
+ mov v19.16b, v23.16b
+ mov v27.16b, v29.16b
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v28.16b, v30.16b
+.endm
+ shift2
+ add x0, x0, x7
+ add x1, x1, x8
+ b.le 5f
+ ld1 {v22.4s, v23.4s}, [x5], x7
+ ld1 {v29.8h}, [x6], x8
+ ld1 {v24.4s, v25.4s}, [x5], x7
+ ld1 {v30.8h}, [x6], x8
+ b 3b
+
+4:
+ // h == 1, !LR_HAVE_BOTTOM.
+ // Pad the last row with the only content row, and add.
+ mov v24.16b, v22.16b
+ mov v25.16b, v23.16b
+ mov v30.16b, v29.16b
+ add5
+ shift2
+ add x0, x0, x7
+ add x1, x1, x8
+ add5
+ b 6f
+
+5:
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.ne 6f
+ // !LR_HAVE_BOTTOM
+ cbnz w3, 5f
+ // The intended three edge rows left; output the one at h-2 and
+ // the past edge one at h.
+ ld1 {v22.4s, v23.4s}, [x5], x7
+ ld1 {v29.8h}, [x6], x8
+ // Pad the past-edge row from the last content row.
+ mov v24.16b, v22.16b
+ mov v25.16b, v23.16b
+ mov v30.16b, v29.16b
+ add5
+ shift2
+ add x0, x0, x7
+ add x1, x1, x8
+ // The last two rows are already padded properly here.
+ add5
+ b 6f
+
+5:
+ // w3 == -1, two rows left, output one.
+ // Pad the last two rows from the mid one.
+ mov v22.16b, v20.16b
+ mov v23.16b, v21.16b
+ mov v29.16b, v28.16b
+ mov v24.16b, v20.16b
+ mov v25.16b, v21.16b
+ mov v30.16b, v28.16b
+ add5
+ add x0, x0, x7
+ add x1, x1, x8
+ b 6f
+
+6: // End of one vertical slice.
+ subs w2, w2, #8
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ msub x5, x7, x11, x5
+ msub x6, x8, x11, x6
+ // Output pointers
+ msub x0, x7, x10, x0
+ msub x1, x8, x10, x1
+ add x0, x0, #32
+ add x1, x1, #16
+ add x5, x5, #32
+ add x6, x6, #16
+ mov w3, w9
+ b 1b
+
+0:
+ ret
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+// const int w, const int h, const int strength);
+function sgr_calc_ab1_neon, export=1
+ add x3, x3, #2 // h += 2
+ movi v31.4s, #9 // n
+ mov x5, #455
+ mov x8, #SUM_STRIDE
+ b sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+ add x3, x3, #3 // h += 3
+ asr x3, x3, #1 // h /= 2
+ movi v31.4s, #25 // n
+ mov x5, #164
+ mov x8, #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+ movrel x12, X(sgr_x_by_x)
+ ld1 {v16.16b, v17.16b, v18.16b}, [x12]
+ movi v19.16b, #5
+ movi v20.8b, #55 // idx of last 5
+ movi v21.8b, #72 // idx of last 4
+ movi v22.8b, #101 // idx of last 3
+ movi v23.8b, #169 // idx of last 2
+ movi v24.8b, #254 // idx of last 1
+ add x2, x2, #2 // w += 2
+ add x7, x2, #7
+ bic x7, x7, #7 // aligned w
+ sub x7, x8, x7 // increment between rows
+ movi v29.8h, #1, lsl #8
+ dup v28.4s, w4
+ dup v30.4s, w5 // one_by_x
+ sub x0, x0, #(4*(SUM_STRIDE))
+ sub x1, x1, #(2*(SUM_STRIDE))
+ mov x6, x2 // backup of w
+ sub v16.16b, v16.16b, v19.16b
+ sub v17.16b, v17.16b, v19.16b
+ sub v18.16b, v18.16b, v19.16b
+1:
+ subs x2, x2, #8
+ ld1 {v0.4s, v1.4s}, [x0] // a
+ ld1 {v2.8h}, [x1] // b
+ mul v0.4s, v0.4s, v31.4s // a * n
+ mul v1.4s, v1.4s, v31.4s // a * n
+ umull v3.4s, v2.4h, v2.4h // b * b
+ umull2 v4.4s, v2.8h, v2.8h // b * b
+ uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
+ uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
+ mul v0.4s, v0.4s, v28.4s // p * s
+ mul v1.4s, v1.4s, v28.4s // p * s
+ uqshrn v0.4h, v0.4s, #16
+ uqshrn2 v0.8h, v1.4s, #16
+ uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
+
+ cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
+ cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
+ tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
+ cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
+ cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
+ add v25.8b, v25.8b, v26.8b
+ cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
+ add v27.8b, v27.8b, v5.8b
+ add v6.8b, v6.8b, v19.8b
+ add v25.8b, v25.8b, v27.8b
+ add v1.8b, v1.8b, v6.8b
+ add v1.8b, v1.8b, v25.8b
+ uxtl v1.8h, v1.8b // x
+
+ umull v3.4s, v1.4h, v2.4h // x * BB[i]
+ umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
+ mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ srshr v3.4s, v3.4s, #12 // AA[i]
+ srshr v4.4s, v4.4s, #12 // AA[i]
+ sub v2.8h, v29.8h, v1.8h // 256 - x
+
+ st1 {v3.4s, v4.4s}, [x0], #32
+ st1 {v2.8h}, [x1], #16
+ b.gt 1b
+
+ subs x3, x3, #1
+ b.le 0f
+ add x0, x0, x7, lsl #2
+ add x1, x1, x7, lsl #1
+ mov x2, x6
+ b 1b
+0:
+ ret
+endfunc
diff --git a/src/arm/64/looprestoration_tmpl.S b/src/arm/64/looprestoration_tmpl.S
new file mode 100644
index 0000000..27c952d
--- /dev/null
+++ b/src/arm/64/looprestoration_tmpl.S
@@ -0,0 +1,474 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
+function sgr_finish_filter1_8bpc_neon, export=1
+ sub x7, x3, #(4*SUM_STRIDE)
+ add x8, x3, #(4*SUM_STRIDE)
+ sub x9, x4, #(2*SUM_STRIDE)
+ add x10, x4, #(2*SUM_STRIDE)
+ mov x11, #SUM_STRIDE
+ mov x12, #FILTER_OUT_STRIDE
+ add x13, x5, #7
+ bic x13, x13, #7 // Aligned width
+ sub x2, x2, x13
+ sub x12, x12, x13
+ sub x11, x11, x13
+ sub x11, x11, #4 // We read 4 extra elements from a
+ sub x14, x11, #4 // We read 8 extra elements from b
+ mov x13, x5
+ movi v6.8h, #3
+ movi v7.4s, #3
+1:
+ ld1 {v0.8h, v1.8h}, [x9], #32
+ ld1 {v2.8h, v3.8h}, [x4], #32
+ ld1 {v4.8h, v5.8h}, [x10], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48
+ ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48
+
+2:
+ subs x5, x5, #8
+ ext v25.16b, v0.16b, v1.16b, #2 // -stride
+ ext v26.16b, v2.16b, v3.16b, #2 // 0
+ ext v27.16b, v4.16b, v5.16b, #2 // +stride
+ ext v28.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v29.16b, v2.16b, v3.16b, #4 // +1
+ ext v30.16b, v4.16b, v5.16b, #4 // +1+stride
+ add v2.8h, v2.8h, v25.8h // -1, -stride
+ add v26.8h, v26.8h, v27.8h // 0, +stride
+ add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride
+ add v2.8h, v2.8h, v26.8h
+ add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride
+ add v2.8h, v2.8h, v29.8h // +1
+ add v0.8h, v0.8h, v4.8h
+
+ ext v25.16b, v16.16b, v17.16b, #4 // -stride
+ ext v26.16b, v17.16b, v18.16b, #4
+ shl v2.8h, v2.8h, #2
+ ext v27.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v28.16b, v17.16b, v18.16b, #8
+ ext v29.16b, v19.16b, v20.16b, #4 // 0
+ ext v30.16b, v20.16b, v21.16b, #4
+ mla v2.8h, v0.8h, v6.8h // * 3 -> a
+ add v25.4s, v25.4s, v19.4s // -stride, -1
+ add v26.4s, v26.4s, v20.4s
+ add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v28.4s
+ ext v27.16b, v19.16b, v20.16b, #8 // +1
+ ext v28.16b, v20.16b, v21.16b, #8
+ add v16.4s, v16.4s, v22.4s // -1+stride
+ add v17.4s, v17.4s, v23.4s
+ add v29.4s, v29.4s, v27.4s // 0, +1
+ add v30.4s, v30.4s, v28.4s
+ add v25.4s, v25.4s, v29.4s
+ add v26.4s, v26.4s, v30.4s
+ ext v27.16b, v22.16b, v23.16b, #4 // +stride
+ ext v28.16b, v23.16b, v24.16b, #4
+ ext v29.16b, v22.16b, v23.16b, #8 // +1+stride
+ ext v30.16b, v23.16b, v24.16b, #8
+ ld1 {v19.8b}, [x1], #8 // src
+ add v25.4s, v25.4s, v27.4s // +stride
+ add v26.4s, v26.4s, v28.4s
+ add v16.4s, v16.4s, v29.4s // +1+stride
+ add v17.4s, v17.4s, v30.4s
+ shl v25.4s, v25.4s, #2
+ shl v26.4s, v26.4s, #2
+ mla v25.4s, v16.4s, v7.4s // * 3 -> b
+ mla v26.4s, v17.4s, v7.4s
+ uxtl v19.8h, v19.8b // src
+ mov v0.16b, v1.16b
+ umlal v25.4s, v2.4h, v19.4h // b + a * src
+ umlal2 v26.4s, v2.8h, v19.8h
+ mov v2.16b, v3.16b
+ rshrn v25.4h, v25.4s, #9
+ rshrn2 v25.8h, v26.4s, #9
+ mov v4.16b, v5.16b
+ st1 {v25.8h}, [x0], #16
+
+ b.le 3f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ mov v22.16b, v24.16b
+ ld1 {v1.8h}, [x9], #16
+ ld1 {v3.8h}, [x4], #16
+ ld1 {v5.8h}, [x10], #16
+ ld1 {v17.4s, v18.4s}, [x7], #32
+ ld1 {v20.4s, v21.4s}, [x3], #32
+ ld1 {v23.4s, v24.4s}, [x8], #32
+ b 2b
+
+3:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x13
+ add x0, x0, x12, lsl #1
+ add x1, x1, x2
+ add x3, x3, x11, lsl #2
+ add x7, x7, x11, lsl #2
+ add x8, x8, x11, lsl #2
+ add x4, x4, x14, lsl #1
+ add x9, x9, x14, lsl #1
+ add x10, x10, x14, lsl #1
+ b 1b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
+// const pixel *src, const ptrdiff_t stride,
+// const int32_t *a, const int16_t *b,
+// const int w, const int h);
+function sgr_finish_filter2_8bpc_neon, export=1
+ add x7, x3, #(4*(SUM_STRIDE))
+ sub x3, x3, #(4*(SUM_STRIDE))
+ add x8, x4, #(2*(SUM_STRIDE))
+ sub x4, x4, #(2*(SUM_STRIDE))
+ mov x9, #(2*SUM_STRIDE)
+ mov x10, #FILTER_OUT_STRIDE
+ add x11, x5, #7
+ bic x11, x11, #7 // Aligned width
+ sub x2, x2, x11
+ sub x10, x10, x11
+ sub x9, x9, x11
+ sub x9, x9, #4 // We read 4 extra elements from a
+ sub x12, x9, #4 // We read 8 extra elements from b
+ mov x11, x5
+ movi v4.8h, #5
+ movi v5.4s, #5
+ movi v6.8h, #6
+ movi v7.4s, #6
+1:
+ ld1 {v0.8h, v1.8h}, [x4], #32
+ ld1 {v2.8h, v3.8h}, [x8], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
+ ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48
+
+2:
+ subs x5, x5, #8
+ ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
+ ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
+ ext v22.16b, v0.16b, v1.16b, #2 // -stride
+ ext v23.16b, v2.16b, v3.16b, #2 // +stride
+ add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
+ add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
+ add v2.8h, v22.8h, v23.8h // -stride, +stride
+ add v0.8h, v0.8h, v25.8h
+
+ ext v22.16b, v16.16b, v17.16b, #4 // -stride
+ ext v23.16b, v17.16b, v18.16b, #4
+ ext v24.16b, v19.16b, v20.16b, #4 // +stride
+ ext v25.16b, v20.16b, v21.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1-stride
+ ext v27.16b, v17.16b, v18.16b, #8
+ ext v28.16b, v19.16b, v20.16b, #8 // +1+stride
+ ext v29.16b, v20.16b, v21.16b, #8
+ mul v0.8h, v0.8h, v4.8h // * 5
+ mla v0.8h, v2.8h, v6.8h // * 6
+ ld1 {v31.8b}, [x1], #8
+ add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride
+ add v17.4s, v17.4s, v27.4s
+ add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride
+ add v20.4s, v20.4s, v29.4s
+ add v16.4s, v16.4s, v19.4s
+ add v17.4s, v17.4s, v20.4s
+
+ add v22.4s, v22.4s, v24.4s // -stride, +stride
+ add v23.4s, v23.4s, v25.4s
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v16.4s, v16.4s, v5.4s // * 5
+ mla v16.4s, v22.4s, v7.4s // * 6
+ mul v17.4s, v17.4s, v5.4s // * 5
+ mla v17.4s, v23.4s, v7.4s // * 6
+
+ uxtl v31.8h, v31.8b
+ umlal v16.4s, v0.4h, v31.4h // b + a * src
+ umlal2 v17.4s, v0.8h, v31.8h
+ mov v0.16b, v1.16b
+ rshrn v16.4h, v16.4s, #9
+ rshrn2 v16.8h, v17.4s, #9
+ mov v2.16b, v3.16b
+ st1 {v16.8h}, [x0], #16
+
+ b.le 3f
+ mov v16.16b, v18.16b
+ mov v19.16b, v21.16b
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v3.8h}, [x8], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ ld1 {v20.4s, v21.4s}, [x7], #32
+ b 2b
+
+3:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x11
+ add x0, x0, x10, lsl #1
+ add x1, x1, x2
+ add x3, x3, x9, lsl #2
+ add x7, x7, x9, lsl #2
+ add x4, x4, x12, lsl #1
+ add x8, x8, x12, lsl #1
+ mov x13, x3
+ mov x14, x4
+
+ ld1 {v0.8h, v1.8h}, [x4], #32
+ ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48
+
+4:
+ subs x5, x5, #8
+ ext v23.16b, v0.16b, v1.16b, #4 // +1
+ ext v22.16b, v0.16b, v1.16b, #2 // 0
+ add v0.8h, v0.8h, v23.8h // -1, +1
+
+ ext v24.16b, v16.16b, v17.16b, #4 // 0
+ ext v25.16b, v17.16b, v18.16b, #4
+ ext v26.16b, v16.16b, v17.16b, #8 // +1
+ ext v27.16b, v17.16b, v18.16b, #8
+ mul v2.8h, v22.8h, v6.8h // * 6
+ mla v2.8h, v0.8h, v4.8h // * 5 -> a
+ ld1 {v31.8b}, [x1], #8
+ add v16.4s, v16.4s, v26.4s // -1, +1
+ add v17.4s, v17.4s, v27.4s
+ uxtl v31.8h, v31.8b
+ // This is, surprisingly, faster than other variants where the
+ // mul+mla pairs are further apart, on Cortex A53.
+ mul v24.4s, v24.4s, v7.4s // * 6
+ mla v24.4s, v16.4s, v5.4s // * 5 -> b
+ mul v25.4s, v25.4s, v7.4s // * 6
+ mla v25.4s, v17.4s, v5.4s // * 5 -> b
+
+ umlal v24.4s, v2.4h, v31.4h // b + a * src
+ umlal2 v25.4s, v2.8h, v31.8h
+ mov v0.16b, v1.16b
+ rshrn v24.4h, v24.4s, #8
+ rshrn2 v24.8h, v25.4s, #8
+ mov v16.16b, v18.16b
+ st1 {v24.8h}, [x0], #16
+
+ b.le 5f
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v17.4s, v18.4s}, [x3], #32
+ b 4b
+
+5:
+ subs x6, x6, #1
+ b.le 0f
+ mov x5, x11
+ add x0, x0, x10, lsl #1
+ add x1, x1, x2
+ mov x3, x13 // Rewind x3/x4 to where they started
+ mov x4, x14
+ b 1b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int w, const int h,
+// const int wt);
+function sgr_weighted1_8bpc_neon, export=1
+ dup v31.8h, w7
+ cmp x6, #2
+ add x9, x0, x1
+ add x10, x2, x3
+ add x11, x4, #2*FILTER_OUT_STRIDE
+ mov x7, #(4*FILTER_OUT_STRIDE)
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ add x8, x5, #7
+ bic x8, x8, #7 // Aligned width
+ sub x1, x1, x8
+ sub x3, x3, x8
+ sub x7, x7, x8, lsl #1
+ mov x8, x5
+ b.lt 2f
+1:
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v4.8b}, [x10], #8
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v5.8h}, [x11], #16
+ subs x5, x5, #8
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v4.8h, v4.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v5.8h, v5.8h, v4.8h // t1 - u
+ ushll v2.4s, v0.4h, #7 // u << 7
+ ushll2 v3.4s, v0.8h, #7 // u << 7
+ ushll v6.4s, v4.4h, #7 // u << 7
+ ushll2 v7.4s, v4.8h, #7 // u << 7
+ smlal v2.4s, v1.4h, v31.4h // v
+ smlal2 v3.4s, v1.8h, v31.8h // v
+ smlal v6.4s, v5.4h, v31.4h // v
+ smlal2 v7.4s, v5.8h, v31.8h // v
+ rshrn v2.4h, v2.4s, #11
+ rshrn2 v2.8h, v3.4s, #11
+ rshrn v6.4h, v6.4s, #11
+ rshrn2 v6.8h, v7.4s, #11
+ sqxtun v2.8b, v2.8h
+ sqxtun v6.8b, v6.8h
+ st1 {v2.8b}, [x0], #8
+ st1 {v6.8b}, [x9], #8
+ b.gt 1b
+
+ sub x6, x6, #2
+ cmp x6, #1
+ b.lt 0f
+ mov x5, x8
+ add x0, x0, x1
+ add x9, x9, x1
+ add x2, x2, x3
+ add x10, x10, x3
+ add x4, x4, x7
+ add x11, x11, x7
+ b.eq 2f
+ b 1b
+
+2:
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v1.8h}, [x4], #16
+ subs x5, x5, #8
+ ushll v0.8h, v0.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ ushll v2.4s, v0.4h, #7 // u << 7
+ ushll2 v3.4s, v0.8h, #7 // u << 7
+ smlal v2.4s, v1.4h, v31.4h // v
+ smlal2 v3.4s, v1.8h, v31.8h // v
+ rshrn v2.4h, v2.4s, #11
+ rshrn2 v2.8h, v3.4s, #11
+ sqxtun v2.8b, v2.8h
+ st1 {v2.8b}, [x0], #8
+ b.gt 2b
+0:
+ ret
+endfunc
+
+// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+// const pixel *src, const ptrdiff_t src_stride,
+// const int16_t *t1, const int16_t *t2,
+// const int w, const int h,
+// const int16_t wt[2]);
+function sgr_weighted2_8bpc_neon, export=1
+ ldr x8, [sp]
+ cmp x7, #2
+ add x10, x0, x1
+ add x11, x2, x3
+ add x12, x4, #2*FILTER_OUT_STRIDE
+ add x13, x5, #2*FILTER_OUT_STRIDE
+ ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
+ mov x8, #4*FILTER_OUT_STRIDE
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ add x9, x6, #7
+ bic x9, x9, #7 // Aligned width
+ sub x1, x1, x9
+ sub x3, x3, x9
+ sub x8, x8, x9, lsl #1
+ mov x9, x6
+ b.lt 2f
+1:
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v16.8b}, [x11], #8
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v17.8h}, [x12], #16
+ ld1 {v2.8h}, [x5], #16
+ ld1 {v18.8h}, [x13], #16
+ subs x6, x6, #8
+ ushll v0.8h, v0.8b, #4 // u
+ ushll v16.8h, v16.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ sub v17.8h, v17.8h, v16.8h // t1 - u
+ sub v18.8h, v18.8h, v16.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ ushll v19.4s, v16.4h, #7 // u << 7
+ ushll2 v20.4s, v16.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+ smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ rshrn v19.4h, v19.4s, #11
+ rshrn2 v19.8h, v20.4s, #11
+ sqxtun v3.8b, v3.8h
+ sqxtun v19.8b, v19.8h
+ st1 {v3.8b}, [x0], #8
+ st1 {v19.8b}, [x10], #8
+ b.gt 1b
+
+ subs x7, x7, #2
+ cmp x7, #1
+ b.lt 0f
+ mov x6, x9
+ add x0, x0, x1
+ add x10, x10, x1
+ add x2, x2, x3
+ add x11, x11, x3
+ add x4, x4, x8
+ add x12, x12, x8
+ add x5, x5, x8
+ add x13, x13, x8
+ b.eq 2f
+ b 1b
+
+2:
+ ld1 {v0.8b}, [x2], #8
+ ld1 {v1.8h}, [x4], #16
+ ld1 {v2.8h}, [x5], #16
+ subs x6, x6, #8
+ ushll v0.8h, v0.8b, #4 // u
+ sub v1.8h, v1.8h, v0.8h // t1 - u
+ sub v2.8h, v2.8h, v0.8h // t2 - u
+ ushll v3.4s, v0.4h, #7 // u << 7
+ ushll2 v4.4s, v0.8h, #7 // u << 7
+ smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u)
+ smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u)
+ smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u)
+ smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u)
+ rshrn v3.4h, v3.4s, #11
+ rshrn2 v3.8h, v4.4s, #11
+ sqxtun v3.8b, v3.8h
+ st1 {v3.8b}, [x0], #8
+ b.gt 1b
+0:
+ ret
+endfunc