author | Martin Storsjö <martin@martin.st> | 2020-02-10 00:39:11 +0300
---|---|---
committer | Martin Storsjö <martin@martin.st> | 2020-02-11 11:43:43 +0300
commit | 7cf5d7535f44d7c2d00e368575d0d26b66c73121 (patch) |
tree | a1ed49b4da47a0382402e6c141b57e3e65437276 /src/arm |
parent | 32e265a86e535b5fad47bcac9b54f83e1e5eab33 (diff) |
arm64: looprestoration: Prepare for 16 bpc by splitting code to separate files
looprestoration_common.S contains functions that can be used as-is, with
a single instantiation of the functions serving both 8 and 16 bpc.
This file will be built once, regardless of which bitdepths are enabled.
looprestoration_tmpl.S contains functions whose source can be shared
and templated between 8 and 16 bpc. This file will be included by the
separate 8 and 16 bpc implementation files.
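For readers skimming the diff below, the split works roughly as sketched here. Only the `#define SUM_STRIDE (384+16)` constant and the `#include "looprestoration_tmpl.S"` line are taken from the diff itself; the bitdepth notes and the future 16 bpc file name are assumptions about where this preparation is headed, not part of this commit.

```
// Sketch only: how the three files are intended to fit together.
// The #include and SUM_STRIDE lines come from the diff; everything
// marked "assumed" is illustrative and not part of this commit.

// src/arm/64/looprestoration.S -- 8 bpc entry points
#define SUM_STRIDE (384+16)         // intermediate buffer stride
#include "looprestoration_tmpl.S"   // shared SGR filter/weighting code,
                                    // assembled once per including file

// src/arm/64/looprestoration_common.S -- sgr_box3_v/box5_v/calc_ab
// helpers that only touch int16/int32 intermediates, so a single
// instantiation serves both bitdepths; always built exactly once.

// src/arm/64/looprestoration16.S (assumed future name) -- would define
// its own 16 bpc constants and #include "looprestoration_tmpl.S" again.
```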
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/64/looprestoration.S | 839
-rw-r--r-- | src/arm/64/looprestoration_common.S | 422
-rw-r--r-- | src/arm/64/looprestoration_tmpl.S | 474
3 files changed, 898 insertions, 837 deletions
diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S index 8a6bfab..b6f0934 100644 --- a/src/arm/64/looprestoration.S +++ b/src/arm/64/looprestoration.S @@ -617,6 +617,8 @@ endfunc #define SUM_STRIDE (384+16) +#include "looprestoration_tmpl.S" + // void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, @@ -1146,840 +1148,3 @@ L(box5_variable_shift_tbl): ret .purgem add5 endfunc - -// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box3_v_neon, export=1 - add w10, w3, #2 // Number of output rows to move back - mov w11, w3 // Number of input rows to move back - add w2, w2, #2 // Actual summed width - mov x7, #(4*SUM_STRIDE) // sumsq stride - mov x8, #(2*SUM_STRIDE) // sum stride - sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride - sub x1, x1, #(2*SUM_STRIDE) // sum -= stride - - tst w4, #4 // LR_HAVE_TOP - b.eq 0f - // If have top, read from row -2. - sub x5, x0, #(4*SUM_STRIDE) - sub x6, x1, #(2*SUM_STRIDE) - add w11, w11, #2 - b 1f -0: - // !LR_HAVE_TOP - // If we don't have top, read from row 0 even if - // we start writing to row -1. - add x5, x0, #(4*SUM_STRIDE) - add x6, x1, #(2*SUM_STRIDE) -1: - - tst w4, #8 // LR_HAVE_BOTTOM - b.eq 1f - // LR_HAVE_BOTTOM - add w3, w3, #2 // Sum all h+2 lines with the main loop - add w11, w11, #2 -1: - mov w9, w3 // Backup of h for next loops - -1: - // Start of horizontal loop; start one vertical filter slice. - // Start loading rows into v16-v21 and v24-v26 taking top - // padding into consideration. - tst w4, #4 // LR_HAVE_TOP - ld1 {v16.4s, v17.4s}, [x5], x7 - ld1 {v24.8h}, [x6], x8 - b.eq 2f - // LR_HAVE_TOP - ld1 {v18.4s, v19.4s}, [x5], x7 - ld1 {v25.8h}, [x6], x8 - ld1 {v20.4s, v21.4s}, [x5], x7 - ld1 {v26.8h}, [x6], x8 - b 3f -2: // !LR_HAVE_TOP - mov v18.16b, v16.16b - mov v19.16b, v17.16b - mov v25.16b, v24.16b - mov v20.16b, v16.16b - mov v21.16b, v17.16b - mov v26.16b, v24.16b - -3: - subs w3, w3, #1 -.macro add3 - add v16.4s, v16.4s, v18.4s - add v17.4s, v17.4s, v19.4s - add v24.8h, v24.8h, v25.8h - add v16.4s, v16.4s, v20.4s - add v17.4s, v17.4s, v21.4s - add v24.8h, v24.8h, v26.8h - st1 {v16.4s, v17.4s}, [x0], x7 - st1 {v24.8h}, [x1], x8 -.endm - add3 - mov v16.16b, v18.16b - mov v17.16b, v19.16b - mov v24.16b, v25.16b - mov v18.16b, v20.16b - mov v19.16b, v21.16b - mov v25.16b, v26.16b - b.le 4f - ld1 {v20.4s, v21.4s}, [x5], x7 - ld1 {v26.8h}, [x6], x8 - b 3b - -4: - tst w4, #8 // LR_HAVE_BOTTOM - b.ne 5f - // !LR_HAVE_BOTTOM - // Produce two more rows, extending the already loaded rows. - add3 - mov v16.16b, v18.16b - mov v17.16b, v19.16b - mov v24.16b, v25.16b - add3 - -5: // End of one vertical slice. - subs w2, w2, #8 - b.le 0f - // Move pointers back up to the top and loop horizontally. 
- // Input pointers - msub x5, x7, x11, x5 - msub x6, x8, x11, x6 - // Output pointers - msub x0, x7, x10, x0 - msub x1, x8, x10, x1 - add x0, x0, #32 - add x1, x1, #16 - add x5, x5, #32 - add x6, x6, #16 - mov w3, w9 - b 1b - -0: - ret -.purgem add3 -endfunc - -// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box5_v_neon, export=1 - add w10, w3, #2 // Number of output rows to move back - mov w11, w3 // Number of input rows to move back - add w2, w2, #8 // Actual summed width - mov x7, #(4*SUM_STRIDE) // sumsq stride - mov x8, #(2*SUM_STRIDE) // sum stride - sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride - sub x1, x1, #(2*SUM_STRIDE) // sum -= stride - - tst w4, #4 // LR_HAVE_TOP - b.eq 0f - // If have top, read from row -2. - sub x5, x0, #(4*SUM_STRIDE) - sub x6, x1, #(2*SUM_STRIDE) - add w11, w11, #2 - b 1f -0: - // !LR_HAVE_TOP - // If we don't have top, read from row 0 even if - // we start writing to row -1. - add x5, x0, #(4*SUM_STRIDE) - add x6, x1, #(2*SUM_STRIDE) -1: - - tst w4, #8 // LR_HAVE_BOTTOM - b.eq 0f - // LR_HAVE_BOTTOM - add w3, w3, #2 // Handle h+2 lines with the main loop - add w11, w11, #2 - b 1f -0: - // !LR_HAVE_BOTTOM - sub w3, w3, #1 // Handle h-1 lines with the main loop -1: - mov w9, w3 // Backup of h for next loops - -1: - // Start of horizontal loop; start one vertical filter slice. - // Start loading rows into v16-v25 and v26-v30 taking top - // padding into consideration. - tst w4, #4 // LR_HAVE_TOP - ld1 {v16.4s, v17.4s}, [x5], x7 - ld1 {v26.8h}, [x6], x8 - b.eq 2f - // LR_HAVE_TOP - ld1 {v20.4s, v21.4s}, [x5], x7 - ld1 {v28.8h}, [x6], x8 - mov v18.16b, v16.16b - mov v19.16b, v17.16b - mov v27.16b, v26.16b - ld1 {v22.4s, v23.4s}, [x5], x7 - ld1 {v29.8h}, [x6], x8 - b 3f -2: // !LR_HAVE_TOP - mov v18.16b, v16.16b - mov v19.16b, v17.16b - mov v27.16b, v26.16b - mov v20.16b, v16.16b - mov v21.16b, v17.16b - mov v28.16b, v26.16b - mov v22.16b, v16.16b - mov v23.16b, v17.16b - mov v29.16b, v26.16b - -3: - cbz w3, 4f - ld1 {v24.4s, v25.4s}, [x5], x7 - ld1 {v30.8h}, [x6], x8 - -3: - // Start of vertical loop - subs w3, w3, #2 -.macro add5 - add v16.4s, v16.4s, v18.4s - add v17.4s, v17.4s, v19.4s - add v26.8h, v26.8h, v27.8h - add v0.4s, v20.4s, v22.4s - add v1.4s, v21.4s, v23.4s - add v2.8h, v28.8h, v29.8h - add v16.4s, v16.4s, v24.4s - add v17.4s, v17.4s, v25.4s - add v26.8h, v26.8h, v30.8h - add v16.4s, v16.4s, v0.4s - add v17.4s, v17.4s, v1.4s - add v26.8h, v26.8h, v2.8h - st1 {v16.4s, v17.4s}, [x0], x7 - st1 {v26.8h}, [x1], x8 -.endm - add5 -.macro shift2 - mov v16.16b, v20.16b - mov v17.16b, v21.16b - mov v26.16b, v28.16b - mov v18.16b, v22.16b - mov v19.16b, v23.16b - mov v27.16b, v29.16b - mov v20.16b, v24.16b - mov v21.16b, v25.16b - mov v28.16b, v30.16b -.endm - shift2 - add x0, x0, x7 - add x1, x1, x8 - b.le 5f - ld1 {v22.4s, v23.4s}, [x5], x7 - ld1 {v29.8h}, [x6], x8 - ld1 {v24.4s, v25.4s}, [x5], x7 - ld1 {v30.8h}, [x6], x8 - b 3b - -4: - // h == 1, !LR_HAVE_BOTTOM. - // Pad the last row with the only content row, and add. - mov v24.16b, v22.16b - mov v25.16b, v23.16b - mov v30.16b, v29.16b - add5 - shift2 - add x0, x0, x7 - add x1, x1, x8 - add5 - b 6f - -5: - tst w4, #8 // LR_HAVE_BOTTOM - b.ne 6f - // !LR_HAVE_BOTTOM - cbnz w3, 5f - // The intended three edge rows left; output the one at h-2 and - // the past edge one at h. - ld1 {v22.4s, v23.4s}, [x5], x7 - ld1 {v29.8h}, [x6], x8 - // Pad the past-edge row from the last content row. 
- mov v24.16b, v22.16b - mov v25.16b, v23.16b - mov v30.16b, v29.16b - add5 - shift2 - add x0, x0, x7 - add x1, x1, x8 - // The last two rows are already padded properly here. - add5 - b 6f - -5: - // w3 == -1, two rows left, output one. - // Pad the last two rows from the mid one. - mov v22.16b, v20.16b - mov v23.16b, v21.16b - mov v29.16b, v28.16b - mov v24.16b, v20.16b - mov v25.16b, v21.16b - mov v30.16b, v28.16b - add5 - add x0, x0, x7 - add x1, x1, x8 - b 6f - -6: // End of one vertical slice. - subs w2, w2, #8 - b.le 0f - // Move pointers back up to the top and loop horizontally. - // Input pointers - msub x5, x7, x11, x5 - msub x6, x8, x11, x6 - // Output pointers - msub x0, x7, x10, x0 - msub x1, x8, x10, x1 - add x0, x0, #32 - add x1, x1, #16 - add x5, x5, #32 - add x6, x6, #16 - mov w3, w9 - b 1b - -0: - ret -.purgem add5 -endfunc - -// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength); -// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength); -function sgr_calc_ab1_neon, export=1 - add x3, x3, #2 // h += 2 - movi v31.4s, #9 // n - mov x5, #455 - mov x8, #SUM_STRIDE - b sgr_calc_ab_neon -endfunc - -function sgr_calc_ab2_neon, export=1 - add x3, x3, #3 // h += 3 - asr x3, x3, #1 // h /= 2 - movi v31.4s, #25 // n - mov x5, #164 - mov x8, #(2*SUM_STRIDE) -endfunc - -function sgr_calc_ab_neon - movrel x12, X(sgr_x_by_x) - ld1 {v16.16b, v17.16b, v18.16b}, [x12] - movi v19.16b, #5 - movi v20.8b, #55 // idx of last 5 - movi v21.8b, #72 // idx of last 4 - movi v22.8b, #101 // idx of last 3 - movi v23.8b, #169 // idx of last 2 - movi v24.8b, #254 // idx of last 1 - add x2, x2, #2 // w += 2 - add x7, x2, #7 - bic x7, x7, #7 // aligned w - sub x7, x8, x7 // increment between rows - movi v29.8h, #1, lsl #8 - dup v28.4s, w4 - dup v30.4s, w5 // one_by_x - sub x0, x0, #(4*(SUM_STRIDE)) - sub x1, x1, #(2*(SUM_STRIDE)) - mov x6, x2 // backup of w - sub v16.16b, v16.16b, v19.16b - sub v17.16b, v17.16b, v19.16b - sub v18.16b, v18.16b, v19.16b -1: - subs x2, x2, #8 - ld1 {v0.4s, v1.4s}, [x0] // a - ld1 {v2.8h}, [x1] // b - mul v0.4s, v0.4s, v31.4s // a * n - mul v1.4s, v1.4s, v31.4s // a * n - umull v3.4s, v2.4h, v2.4h // b * b - umull2 v4.4s, v2.8h, v2.8h // b * b - uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) - uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) - mul v0.4s, v0.4s, v28.4s // p * s - mul v1.4s, v1.4s, v28.4s // p * s - uqshrn v0.4h, v0.4s, #16 - uqshrn2 v0.8h, v1.4s, #16 - uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) - - cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 - cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 - tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b - cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 - cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 - add v25.8b, v25.8b, v26.8b - cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 - add v27.8b, v27.8b, v5.8b - add v6.8b, v6.8b, v19.8b - add v25.8b, v25.8b, v27.8b - add v1.8b, v1.8b, v6.8b - add v1.8b, v1.8b, v25.8b - uxtl v1.8h, v1.8b // x - - umull v3.4s, v1.4h, v2.4h // x * BB[i] - umull2 v4.4s, v1.8h, v2.8h // x * BB[i] - mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x - mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x - srshr v3.4s, v3.4s, #12 // AA[i] - srshr v4.4s, v4.4s, #12 // AA[i] - sub v2.8h, v29.8h, v1.8h // 256 - x - - st1 {v3.4s, v4.4s}, [x0], #32 - st1 {v2.8h}, [x1], #16 - b.gt 1b - - subs x3, x3, #1 - b.le 0f - add x0, x0, x7, lsl #2 - add x1, x1, x7, 
lsl #1 - mov x2, x6 - b 1b -0: - ret -endfunc - -#define FILTER_OUT_STRIDE 384 - -// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp, -// const pixel *src, const ptrdiff_t stride, -// const int32_t *a, const int16_t *b, -// const int w, const int h); -function sgr_finish_filter1_8bpc_neon, export=1 - sub x7, x3, #(4*SUM_STRIDE) - add x8, x3, #(4*SUM_STRIDE) - sub x9, x4, #(2*SUM_STRIDE) - add x10, x4, #(2*SUM_STRIDE) - mov x11, #SUM_STRIDE - mov x12, #FILTER_OUT_STRIDE - add x13, x5, #7 - bic x13, x13, #7 // Aligned width - sub x2, x2, x13 - sub x12, x12, x13 - sub x11, x11, x13 - sub x11, x11, #4 // We read 4 extra elements from a - sub x14, x11, #4 // We read 8 extra elements from b - mov x13, x5 - movi v6.8h, #3 - movi v7.4s, #3 -1: - ld1 {v0.8h, v1.8h}, [x9], #32 - ld1 {v2.8h, v3.8h}, [x4], #32 - ld1 {v4.8h, v5.8h}, [x10], #32 - ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 - ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48 - ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48 - -2: - subs x5, x5, #8 - ext v25.16b, v0.16b, v1.16b, #2 // -stride - ext v26.16b, v2.16b, v3.16b, #2 // 0 - ext v27.16b, v4.16b, v5.16b, #2 // +stride - ext v28.16b, v0.16b, v1.16b, #4 // +1-stride - ext v29.16b, v2.16b, v3.16b, #4 // +1 - ext v30.16b, v4.16b, v5.16b, #4 // +1+stride - add v2.8h, v2.8h, v25.8h // -1, -stride - add v26.8h, v26.8h, v27.8h // 0, +stride - add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride - add v2.8h, v2.8h, v26.8h - add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride - add v2.8h, v2.8h, v29.8h // +1 - add v0.8h, v0.8h, v4.8h - - ext v25.16b, v16.16b, v17.16b, #4 // -stride - ext v26.16b, v17.16b, v18.16b, #4 - shl v2.8h, v2.8h, #2 - ext v27.16b, v16.16b, v17.16b, #8 // +1-stride - ext v28.16b, v17.16b, v18.16b, #8 - ext v29.16b, v19.16b, v20.16b, #4 // 0 - ext v30.16b, v20.16b, v21.16b, #4 - mla v2.8h, v0.8h, v6.8h // * 3 -> a - add v25.4s, v25.4s, v19.4s // -stride, -1 - add v26.4s, v26.4s, v20.4s - add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride - add v17.4s, v17.4s, v28.4s - ext v27.16b, v19.16b, v20.16b, #8 // +1 - ext v28.16b, v20.16b, v21.16b, #8 - add v16.4s, v16.4s, v22.4s // -1+stride - add v17.4s, v17.4s, v23.4s - add v29.4s, v29.4s, v27.4s // 0, +1 - add v30.4s, v30.4s, v28.4s - add v25.4s, v25.4s, v29.4s - add v26.4s, v26.4s, v30.4s - ext v27.16b, v22.16b, v23.16b, #4 // +stride - ext v28.16b, v23.16b, v24.16b, #4 - ext v29.16b, v22.16b, v23.16b, #8 // +1+stride - ext v30.16b, v23.16b, v24.16b, #8 - ld1 {v19.8b}, [x1], #8 // src - add v25.4s, v25.4s, v27.4s // +stride - add v26.4s, v26.4s, v28.4s - add v16.4s, v16.4s, v29.4s // +1+stride - add v17.4s, v17.4s, v30.4s - shl v25.4s, v25.4s, #2 - shl v26.4s, v26.4s, #2 - mla v25.4s, v16.4s, v7.4s // * 3 -> b - mla v26.4s, v17.4s, v7.4s - uxtl v19.8h, v19.8b // src - mov v0.16b, v1.16b - umlal v25.4s, v2.4h, v19.4h // b + a * src - umlal2 v26.4s, v2.8h, v19.8h - mov v2.16b, v3.16b - rshrn v25.4h, v25.4s, #9 - rshrn2 v25.8h, v26.4s, #9 - mov v4.16b, v5.16b - st1 {v25.8h}, [x0], #16 - - b.le 3f - mov v16.16b, v18.16b - mov v19.16b, v21.16b - mov v22.16b, v24.16b - ld1 {v1.8h}, [x9], #16 - ld1 {v3.8h}, [x4], #16 - ld1 {v5.8h}, [x10], #16 - ld1 {v17.4s, v18.4s}, [x7], #32 - ld1 {v20.4s, v21.4s}, [x3], #32 - ld1 {v23.4s, v24.4s}, [x8], #32 - b 2b - -3: - subs x6, x6, #1 - b.le 0f - mov x5, x13 - add x0, x0, x12, lsl #1 - add x1, x1, x2 - add x3, x3, x11, lsl #2 - add x7, x7, x11, lsl #2 - add x8, x8, x11, lsl #2 - add x4, x4, x14, lsl #1 - add x9, x9, x14, lsl #1 - add x10, x10, x14, lsl #1 - b 1b -0: - ret -endfunc - -// void 
dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp, -// const pixel *src, const ptrdiff_t stride, -// const int32_t *a, const int16_t *b, -// const int w, const int h); -function sgr_finish_filter2_8bpc_neon, export=1 - add x7, x3, #(4*(SUM_STRIDE)) - sub x3, x3, #(4*(SUM_STRIDE)) - add x8, x4, #(2*(SUM_STRIDE)) - sub x4, x4, #(2*(SUM_STRIDE)) - mov x9, #(2*SUM_STRIDE) - mov x10, #FILTER_OUT_STRIDE - add x11, x5, #7 - bic x11, x11, #7 // Aligned width - sub x2, x2, x11 - sub x10, x10, x11 - sub x9, x9, x11 - sub x9, x9, #4 // We read 4 extra elements from a - sub x12, x9, #4 // We read 8 extra elements from b - mov x11, x5 - movi v4.8h, #5 - movi v5.4s, #5 - movi v6.8h, #6 - movi v7.4s, #6 -1: - ld1 {v0.8h, v1.8h}, [x4], #32 - ld1 {v2.8h, v3.8h}, [x8], #32 - ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 - ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48 - -2: - subs x5, x5, #8 - ext v24.16b, v0.16b, v1.16b, #4 // +1-stride - ext v25.16b, v2.16b, v3.16b, #4 // +1+stride - ext v22.16b, v0.16b, v1.16b, #2 // -stride - ext v23.16b, v2.16b, v3.16b, #2 // +stride - add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride - add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride - add v2.8h, v22.8h, v23.8h // -stride, +stride - add v0.8h, v0.8h, v25.8h - - ext v22.16b, v16.16b, v17.16b, #4 // -stride - ext v23.16b, v17.16b, v18.16b, #4 - ext v24.16b, v19.16b, v20.16b, #4 // +stride - ext v25.16b, v20.16b, v21.16b, #4 - ext v26.16b, v16.16b, v17.16b, #8 // +1-stride - ext v27.16b, v17.16b, v18.16b, #8 - ext v28.16b, v19.16b, v20.16b, #8 // +1+stride - ext v29.16b, v20.16b, v21.16b, #8 - mul v0.8h, v0.8h, v4.8h // * 5 - mla v0.8h, v2.8h, v6.8h // * 6 - ld1 {v31.8b}, [x1], #8 - add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride - add v17.4s, v17.4s, v27.4s - add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride - add v20.4s, v20.4s, v29.4s - add v16.4s, v16.4s, v19.4s - add v17.4s, v17.4s, v20.4s - - add v22.4s, v22.4s, v24.4s // -stride, +stride - add v23.4s, v23.4s, v25.4s - // This is, surprisingly, faster than other variants where the - // mul+mla pairs are further apart, on Cortex A53. - mul v16.4s, v16.4s, v5.4s // * 5 - mla v16.4s, v22.4s, v7.4s // * 6 - mul v17.4s, v17.4s, v5.4s // * 5 - mla v17.4s, v23.4s, v7.4s // * 6 - - uxtl v31.8h, v31.8b - umlal v16.4s, v0.4h, v31.4h // b + a * src - umlal2 v17.4s, v0.8h, v31.8h - mov v0.16b, v1.16b - rshrn v16.4h, v16.4s, #9 - rshrn2 v16.8h, v17.4s, #9 - mov v2.16b, v3.16b - st1 {v16.8h}, [x0], #16 - - b.le 3f - mov v16.16b, v18.16b - mov v19.16b, v21.16b - ld1 {v1.8h}, [x4], #16 - ld1 {v3.8h}, [x8], #16 - ld1 {v17.4s, v18.4s}, [x3], #32 - ld1 {v20.4s, v21.4s}, [x7], #32 - b 2b - -3: - subs x6, x6, #1 - b.le 0f - mov x5, x11 - add x0, x0, x10, lsl #1 - add x1, x1, x2 - add x3, x3, x9, lsl #2 - add x7, x7, x9, lsl #2 - add x4, x4, x12, lsl #1 - add x8, x8, x12, lsl #1 - mov x13, x3 - mov x14, x4 - - ld1 {v0.8h, v1.8h}, [x4], #32 - ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 - -4: - subs x5, x5, #8 - ext v23.16b, v0.16b, v1.16b, #4 // +1 - ext v22.16b, v0.16b, v1.16b, #2 // 0 - add v0.8h, v0.8h, v23.8h // -1, +1 - - ext v24.16b, v16.16b, v17.16b, #4 // 0 - ext v25.16b, v17.16b, v18.16b, #4 - ext v26.16b, v16.16b, v17.16b, #8 // +1 - ext v27.16b, v17.16b, v18.16b, #8 - mul v2.8h, v22.8h, v6.8h // * 6 - mla v2.8h, v0.8h, v4.8h // * 5 -> a - ld1 {v31.8b}, [x1], #8 - add v16.4s, v16.4s, v26.4s // -1, +1 - add v17.4s, v17.4s, v27.4s - uxtl v31.8h, v31.8b - // This is, surprisingly, faster than other variants where the - // mul+mla pairs are further apart, on Cortex A53. 
- mul v24.4s, v24.4s, v7.4s // * 6 - mla v24.4s, v16.4s, v5.4s // * 5 -> b - mul v25.4s, v25.4s, v7.4s // * 6 - mla v25.4s, v17.4s, v5.4s // * 5 -> b - - umlal v24.4s, v2.4h, v31.4h // b + a * src - umlal2 v25.4s, v2.8h, v31.8h - mov v0.16b, v1.16b - rshrn v24.4h, v24.4s, #8 - rshrn2 v24.8h, v25.4s, #8 - mov v16.16b, v18.16b - st1 {v24.8h}, [x0], #16 - - b.le 5f - ld1 {v1.8h}, [x4], #16 - ld1 {v17.4s, v18.4s}, [x3], #32 - b 4b - -5: - subs x6, x6, #1 - b.le 0f - mov x5, x11 - add x0, x0, x10, lsl #1 - add x1, x1, x2 - mov x3, x13 // Rewind x3/x4 to where they started - mov x4, x14 - b 1b -0: - ret -endfunc - -// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride, -// const pixel *src, const ptrdiff_t src_stride, -// const int16_t *t1, const int w, const int h, -// const int wt); -function sgr_weighted1_8bpc_neon, export=1 - dup v31.8h, w7 - cmp x6, #2 - add x9, x0, x1 - add x10, x2, x3 - add x11, x4, #2*FILTER_OUT_STRIDE - mov x7, #(4*FILTER_OUT_STRIDE) - lsl x1, x1, #1 - lsl x3, x3, #1 - add x8, x5, #7 - bic x8, x8, #7 // Aligned width - sub x1, x1, x8 - sub x3, x3, x8 - sub x7, x7, x8, lsl #1 - mov x8, x5 - b.lt 2f -1: - ld1 {v0.8b}, [x2], #8 - ld1 {v4.8b}, [x10], #8 - ld1 {v1.8h}, [x4], #16 - ld1 {v5.8h}, [x11], #16 - subs x5, x5, #8 - ushll v0.8h, v0.8b, #4 // u - ushll v4.8h, v4.8b, #4 // u - sub v1.8h, v1.8h, v0.8h // t1 - u - sub v5.8h, v5.8h, v4.8h // t1 - u - ushll v2.4s, v0.4h, #7 // u << 7 - ushll2 v3.4s, v0.8h, #7 // u << 7 - ushll v6.4s, v4.4h, #7 // u << 7 - ushll2 v7.4s, v4.8h, #7 // u << 7 - smlal v2.4s, v1.4h, v31.4h // v - smlal2 v3.4s, v1.8h, v31.8h // v - smlal v6.4s, v5.4h, v31.4h // v - smlal2 v7.4s, v5.8h, v31.8h // v - rshrn v2.4h, v2.4s, #11 - rshrn2 v2.8h, v3.4s, #11 - rshrn v6.4h, v6.4s, #11 - rshrn2 v6.8h, v7.4s, #11 - sqxtun v2.8b, v2.8h - sqxtun v6.8b, v6.8h - st1 {v2.8b}, [x0], #8 - st1 {v6.8b}, [x9], #8 - b.gt 1b - - sub x6, x6, #2 - cmp x6, #1 - b.lt 0f - mov x5, x8 - add x0, x0, x1 - add x9, x9, x1 - add x2, x2, x3 - add x10, x10, x3 - add x4, x4, x7 - add x11, x11, x7 - b.eq 2f - b 1b - -2: - ld1 {v0.8b}, [x2], #8 - ld1 {v1.8h}, [x4], #16 - subs x5, x5, #8 - ushll v0.8h, v0.8b, #4 // u - sub v1.8h, v1.8h, v0.8h // t1 - u - ushll v2.4s, v0.4h, #7 // u << 7 - ushll2 v3.4s, v0.8h, #7 // u << 7 - smlal v2.4s, v1.4h, v31.4h // v - smlal2 v3.4s, v1.8h, v31.8h // v - rshrn v2.4h, v2.4s, #11 - rshrn2 v2.8h, v3.4s, #11 - sqxtun v2.8b, v2.8h - st1 {v2.8b}, [x0], #8 - b.gt 2b -0: - ret -endfunc - -// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *src, const ptrdiff_t src_stride, -// const int16_t *t1, const int16_t *t2, -// const int w, const int h, -// const int16_t wt[2]); -function sgr_weighted2_8bpc_neon, export=1 - ldr x8, [sp] - cmp x7, #2 - add x10, x0, x1 - add x11, x2, x3 - add x12, x4, #2*FILTER_OUT_STRIDE - add x13, x5, #2*FILTER_OUT_STRIDE - ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1] - mov x8, #4*FILTER_OUT_STRIDE - lsl x1, x1, #1 - lsl x3, x3, #1 - add x9, x6, #7 - bic x9, x9, #7 // Aligned width - sub x1, x1, x9 - sub x3, x3, x9 - sub x8, x8, x9, lsl #1 - mov x9, x6 - b.lt 2f -1: - ld1 {v0.8b}, [x2], #8 - ld1 {v16.8b}, [x11], #8 - ld1 {v1.8h}, [x4], #16 - ld1 {v17.8h}, [x12], #16 - ld1 {v2.8h}, [x5], #16 - ld1 {v18.8h}, [x13], #16 - subs x6, x6, #8 - ushll v0.8h, v0.8b, #4 // u - ushll v16.8h, v16.8b, #4 // u - sub v1.8h, v1.8h, v0.8h // t1 - u - sub v2.8h, v2.8h, v0.8h // t2 - u - sub v17.8h, v17.8h, v16.8h // t1 - u - sub v18.8h, v18.8h, v16.8h // t2 - u - ushll v3.4s, v0.4h, #7 // u << 
7 - ushll2 v4.4s, v0.8h, #7 // u << 7 - ushll v19.4s, v16.4h, #7 // u << 7 - ushll2 v20.4s, v16.8h, #7 // u << 7 - smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u) - smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) - smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) - smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) - smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u) - smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u) - smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u) - smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u) - rshrn v3.4h, v3.4s, #11 - rshrn2 v3.8h, v4.4s, #11 - rshrn v19.4h, v19.4s, #11 - rshrn2 v19.8h, v20.4s, #11 - sqxtun v3.8b, v3.8h - sqxtun v19.8b, v19.8h - st1 {v3.8b}, [x0], #8 - st1 {v19.8b}, [x10], #8 - b.gt 1b - - subs x7, x7, #2 - cmp x7, #1 - b.lt 0f - mov x6, x9 - add x0, x0, x1 - add x10, x10, x1 - add x2, x2, x3 - add x11, x11, x3 - add x4, x4, x8 - add x12, x12, x8 - add x5, x5, x8 - add x13, x13, x8 - b.eq 2f - b 1b - -2: - ld1 {v0.8b}, [x2], #8 - ld1 {v1.8h}, [x4], #16 - ld1 {v2.8h}, [x5], #16 - subs x6, x6, #8 - ushll v0.8h, v0.8b, #4 // u - sub v1.8h, v1.8h, v0.8h // t1 - u - sub v2.8h, v2.8h, v0.8h // t2 - u - ushll v3.4s, v0.4h, #7 // u << 7 - ushll2 v4.4s, v0.8h, #7 // u << 7 - smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u) - smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) - smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) - smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) - rshrn v3.4h, v3.4s, #11 - rshrn2 v3.8h, v4.4s, #11 - sqxtun v3.8b, v3.8h - st1 {v3.8b}, [x0], #8 - b.gt 1b -0: - ret -endfunc diff --git a/src/arm/64/looprestoration_common.S b/src/arm/64/looprestoration_common.S new file mode 100644 index 0000000..dc07827 --- /dev/null +++ b/src/arm/64/looprestoration_common.S @@ -0,0 +1,422 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/arm/asm.S" +#include "util.S" + +#define SUM_STRIDE (384+16) + +// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_v_neon, export=1 + add w10, w3, #2 // Number of output rows to move back + mov w11, w3 // Number of input rows to move back + add w2, w2, #2 // Actual summed width + mov x7, #(4*SUM_STRIDE) // sumsq stride + mov x8, #(2*SUM_STRIDE) // sum stride + sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride + sub x1, x1, #(2*SUM_STRIDE) // sum -= stride + + tst w4, #4 // LR_HAVE_TOP + b.eq 0f + // If have top, read from row -2. + sub x5, x0, #(4*SUM_STRIDE) + sub x6, x1, #(2*SUM_STRIDE) + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add x5, x0, #(4*SUM_STRIDE) + add x6, x1, #(2*SUM_STRIDE) +1: + + tst w4, #8 // LR_HAVE_BOTTOM + b.eq 1f + // LR_HAVE_BOTTOM + add w3, w3, #2 // Sum all h+2 lines with the main loop + add w11, w11, #2 +1: + mov w9, w3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into v16-v21 and v24-v26 taking top + // padding into consideration. + tst w4, #4 // LR_HAVE_TOP + ld1 {v16.4s, v17.4s}, [x5], x7 + ld1 {v24.8h}, [x6], x8 + b.eq 2f + // LR_HAVE_TOP + ld1 {v18.4s, v19.4s}, [x5], x7 + ld1 {v25.8h}, [x6], x8 + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b 3f +2: // !LR_HAVE_TOP + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v25.16b, v24.16b + mov v20.16b, v16.16b + mov v21.16b, v17.16b + mov v26.16b, v24.16b + +3: + subs w3, w3, #1 +.macro add3 + add v16.4s, v16.4s, v18.4s + add v17.4s, v17.4s, v19.4s + add v24.8h, v24.8h, v25.8h + add v16.4s, v16.4s, v20.4s + add v17.4s, v17.4s, v21.4s + add v24.8h, v24.8h, v26.8h + st1 {v16.4s, v17.4s}, [x0], x7 + st1 {v24.8h}, [x1], x8 +.endm + add3 + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v24.16b, v25.16b + mov v18.16b, v20.16b + mov v19.16b, v21.16b + mov v25.16b, v26.16b + b.le 4f + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b 3b + +4: + tst w4, #8 // LR_HAVE_BOTTOM + b.ne 5f + // !LR_HAVE_BOTTOM + // Produce two more rows, extending the already loaded rows. + add3 + mov v16.16b, v18.16b + mov v17.16b, v19.16b + mov v24.16b, v25.16b + add3 + +5: // End of one vertical slice. + subs w2, w2, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + msub x5, x7, x11, x5 + msub x6, x8, x11, x6 + // Output pointers + msub x0, x7, x10, x0 + msub x1, x8, x10, x1 + add x0, x0, #32 + add x1, x1, #16 + add x5, x5, #32 + add x6, x6, #16 + mov w3, w9 + b 1b + +0: + ret +.purgem add3 +endfunc + +// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_v_neon, export=1 + add w10, w3, #2 // Number of output rows to move back + mov w11, w3 // Number of input rows to move back + add w2, w2, #8 // Actual summed width + mov x7, #(4*SUM_STRIDE) // sumsq stride + mov x8, #(2*SUM_STRIDE) // sum stride + sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride + sub x1, x1, #(2*SUM_STRIDE) // sum -= stride + + tst w4, #4 // LR_HAVE_TOP + b.eq 0f + // If have top, read from row -2. + sub x5, x0, #(4*SUM_STRIDE) + sub x6, x1, #(2*SUM_STRIDE) + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. 
+ add x5, x0, #(4*SUM_STRIDE) + add x6, x1, #(2*SUM_STRIDE) +1: + + tst w4, #8 // LR_HAVE_BOTTOM + b.eq 0f + // LR_HAVE_BOTTOM + add w3, w3, #2 // Handle h+2 lines with the main loop + add w11, w11, #2 + b 1f +0: + // !LR_HAVE_BOTTOM + sub w3, w3, #1 // Handle h-1 lines with the main loop +1: + mov w9, w3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into v16-v25 and v26-v30 taking top + // padding into consideration. + tst w4, #4 // LR_HAVE_TOP + ld1 {v16.4s, v17.4s}, [x5], x7 + ld1 {v26.8h}, [x6], x8 + b.eq 2f + // LR_HAVE_TOP + ld1 {v20.4s, v21.4s}, [x5], x7 + ld1 {v28.8h}, [x6], x8 + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v27.16b, v26.16b + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + b 3f +2: // !LR_HAVE_TOP + mov v18.16b, v16.16b + mov v19.16b, v17.16b + mov v27.16b, v26.16b + mov v20.16b, v16.16b + mov v21.16b, v17.16b + mov v28.16b, v26.16b + mov v22.16b, v16.16b + mov v23.16b, v17.16b + mov v29.16b, v26.16b + +3: + cbz w3, 4f + ld1 {v24.4s, v25.4s}, [x5], x7 + ld1 {v30.8h}, [x6], x8 + +3: + // Start of vertical loop + subs w3, w3, #2 +.macro add5 + add v16.4s, v16.4s, v18.4s + add v17.4s, v17.4s, v19.4s + add v26.8h, v26.8h, v27.8h + add v0.4s, v20.4s, v22.4s + add v1.4s, v21.4s, v23.4s + add v2.8h, v28.8h, v29.8h + add v16.4s, v16.4s, v24.4s + add v17.4s, v17.4s, v25.4s + add v26.8h, v26.8h, v30.8h + add v16.4s, v16.4s, v0.4s + add v17.4s, v17.4s, v1.4s + add v26.8h, v26.8h, v2.8h + st1 {v16.4s, v17.4s}, [x0], x7 + st1 {v26.8h}, [x1], x8 +.endm + add5 +.macro shift2 + mov v16.16b, v20.16b + mov v17.16b, v21.16b + mov v26.16b, v28.16b + mov v18.16b, v22.16b + mov v19.16b, v23.16b + mov v27.16b, v29.16b + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v28.16b, v30.16b +.endm + shift2 + add x0, x0, x7 + add x1, x1, x8 + b.le 5f + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + ld1 {v24.4s, v25.4s}, [x5], x7 + ld1 {v30.8h}, [x6], x8 + b 3b + +4: + // h == 1, !LR_HAVE_BOTTOM. + // Pad the last row with the only content row, and add. + mov v24.16b, v22.16b + mov v25.16b, v23.16b + mov v30.16b, v29.16b + add5 + shift2 + add x0, x0, x7 + add x1, x1, x8 + add5 + b 6f + +5: + tst w4, #8 // LR_HAVE_BOTTOM + b.ne 6f + // !LR_HAVE_BOTTOM + cbnz w3, 5f + // The intended three edge rows left; output the one at h-2 and + // the past edge one at h. + ld1 {v22.4s, v23.4s}, [x5], x7 + ld1 {v29.8h}, [x6], x8 + // Pad the past-edge row from the last content row. + mov v24.16b, v22.16b + mov v25.16b, v23.16b + mov v30.16b, v29.16b + add5 + shift2 + add x0, x0, x7 + add x1, x1, x8 + // The last two rows are already padded properly here. + add5 + b 6f + +5: + // w3 == -1, two rows left, output one. + // Pad the last two rows from the mid one. + mov v22.16b, v20.16b + mov v23.16b, v21.16b + mov v29.16b, v28.16b + mov v24.16b, v20.16b + mov v25.16b, v21.16b + mov v30.16b, v28.16b + add5 + add x0, x0, x7 + add x1, x1, x8 + b 6f + +6: // End of one vertical slice. + subs w2, w2, #8 + b.le 0f + // Move pointers back up to the top and loop horizontally. 
+ // Input pointers + msub x5, x7, x11, x5 + msub x6, x8, x11, x6 + // Output pointers + msub x0, x7, x10, x0 + msub x1, x8, x10, x1 + add x0, x0, #32 + add x1, x1, #16 + add x5, x5, #32 + add x6, x6, #16 + mov w3, w9 + b 1b + +0: + ret +.purgem add5 +endfunc + +// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength); +// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength); +function sgr_calc_ab1_neon, export=1 + add x3, x3, #2 // h += 2 + movi v31.4s, #9 // n + mov x5, #455 + mov x8, #SUM_STRIDE + b sgr_calc_ab_neon +endfunc + +function sgr_calc_ab2_neon, export=1 + add x3, x3, #3 // h += 3 + asr x3, x3, #1 // h /= 2 + movi v31.4s, #25 // n + mov x5, #164 + mov x8, #(2*SUM_STRIDE) +endfunc + +function sgr_calc_ab_neon + movrel x12, X(sgr_x_by_x) + ld1 {v16.16b, v17.16b, v18.16b}, [x12] + movi v19.16b, #5 + movi v20.8b, #55 // idx of last 5 + movi v21.8b, #72 // idx of last 4 + movi v22.8b, #101 // idx of last 3 + movi v23.8b, #169 // idx of last 2 + movi v24.8b, #254 // idx of last 1 + add x2, x2, #2 // w += 2 + add x7, x2, #7 + bic x7, x7, #7 // aligned w + sub x7, x8, x7 // increment between rows + movi v29.8h, #1, lsl #8 + dup v28.4s, w4 + dup v30.4s, w5 // one_by_x + sub x0, x0, #(4*(SUM_STRIDE)) + sub x1, x1, #(2*(SUM_STRIDE)) + mov x6, x2 // backup of w + sub v16.16b, v16.16b, v19.16b + sub v17.16b, v17.16b, v19.16b + sub v18.16b, v18.16b, v19.16b +1: + subs x2, x2, #8 + ld1 {v0.4s, v1.4s}, [x0] // a + ld1 {v2.8h}, [x1] // b + mul v0.4s, v0.4s, v31.4s // a * n + mul v1.4s, v1.4s, v31.4s // a * n + umull v3.4s, v2.4h, v2.4h // b * b + umull2 v4.4s, v2.8h, v2.8h // b * b + uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) + uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) + mul v0.4s, v0.4s, v28.4s // p * s + mul v1.4s, v1.4s, v28.4s // p * s + uqshrn v0.4h, v0.4s, #16 + uqshrn2 v0.8h, v1.4s, #16 + uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) + + cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 + cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 + tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b + cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 + cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 + add v25.8b, v25.8b, v26.8b + cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 + add v27.8b, v27.8b, v5.8b + add v6.8b, v6.8b, v19.8b + add v25.8b, v25.8b, v27.8b + add v1.8b, v1.8b, v6.8b + add v1.8b, v1.8b, v25.8b + uxtl v1.8h, v1.8b // x + + umull v3.4s, v1.4h, v2.4h // x * BB[i] + umull2 v4.4s, v1.8h, v2.8h // x * BB[i] + mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x + srshr v3.4s, v3.4s, #12 // AA[i] + srshr v4.4s, v4.4s, #12 // AA[i] + sub v2.8h, v29.8h, v1.8h // 256 - x + + st1 {v3.4s, v4.4s}, [x0], #32 + st1 {v2.8h}, [x1], #16 + b.gt 1b + + subs x3, x3, #1 + b.le 0f + add x0, x0, x7, lsl #2 + add x1, x1, x7, lsl #1 + mov x2, x6 + b 1b +0: + ret +endfunc diff --git a/src/arm/64/looprestoration_tmpl.S b/src/arm/64/looprestoration_tmpl.S new file mode 100644 index 0000000..27c952d --- /dev/null +++ b/src/arm/64/looprestoration_tmpl.S @@ -0,0 +1,474 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" + +#define FILTER_OUT_STRIDE 384 + +// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter1_8bpc_neon, export=1 + sub x7, x3, #(4*SUM_STRIDE) + add x8, x3, #(4*SUM_STRIDE) + sub x9, x4, #(2*SUM_STRIDE) + add x10, x4, #(2*SUM_STRIDE) + mov x11, #SUM_STRIDE + mov x12, #FILTER_OUT_STRIDE + add x13, x5, #7 + bic x13, x13, #7 // Aligned width + sub x2, x2, x13 + sub x12, x12, x13 + sub x11, x11, x13 + sub x11, x11, #4 // We read 4 extra elements from a + sub x14, x11, #4 // We read 8 extra elements from b + mov x13, x5 + movi v6.8h, #3 + movi v7.4s, #3 +1: + ld1 {v0.8h, v1.8h}, [x9], #32 + ld1 {v2.8h, v3.8h}, [x4], #32 + ld1 {v4.8h, v5.8h}, [x10], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48 + ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48 + +2: + subs x5, x5, #8 + ext v25.16b, v0.16b, v1.16b, #2 // -stride + ext v26.16b, v2.16b, v3.16b, #2 // 0 + ext v27.16b, v4.16b, v5.16b, #2 // +stride + ext v28.16b, v0.16b, v1.16b, #4 // +1-stride + ext v29.16b, v2.16b, v3.16b, #4 // +1 + ext v30.16b, v4.16b, v5.16b, #4 // +1+stride + add v2.8h, v2.8h, v25.8h // -1, -stride + add v26.8h, v26.8h, v27.8h // 0, +stride + add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride + add v2.8h, v2.8h, v26.8h + add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride + add v2.8h, v2.8h, v29.8h // +1 + add v0.8h, v0.8h, v4.8h + + ext v25.16b, v16.16b, v17.16b, #4 // -stride + ext v26.16b, v17.16b, v18.16b, #4 + shl v2.8h, v2.8h, #2 + ext v27.16b, v16.16b, v17.16b, #8 // +1-stride + ext v28.16b, v17.16b, v18.16b, #8 + ext v29.16b, v19.16b, v20.16b, #4 // 0 + ext v30.16b, v20.16b, v21.16b, #4 + mla v2.8h, v0.8h, v6.8h // * 3 -> a + add v25.4s, v25.4s, v19.4s // -stride, -1 + add v26.4s, v26.4s, v20.4s + add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride + add v17.4s, v17.4s, v28.4s + ext v27.16b, v19.16b, v20.16b, #8 // +1 + ext v28.16b, v20.16b, v21.16b, #8 + add v16.4s, v16.4s, v22.4s // -1+stride + add v17.4s, v17.4s, v23.4s + add v29.4s, v29.4s, v27.4s // 0, +1 + add v30.4s, v30.4s, v28.4s + add v25.4s, v25.4s, v29.4s + add v26.4s, v26.4s, v30.4s + ext v27.16b, v22.16b, v23.16b, #4 // +stride + ext v28.16b, v23.16b, v24.16b, #4 + ext v29.16b, v22.16b, v23.16b, #8 // +1+stride + ext v30.16b, 
v23.16b, v24.16b, #8 + ld1 {v19.8b}, [x1], #8 // src + add v25.4s, v25.4s, v27.4s // +stride + add v26.4s, v26.4s, v28.4s + add v16.4s, v16.4s, v29.4s // +1+stride + add v17.4s, v17.4s, v30.4s + shl v25.4s, v25.4s, #2 + shl v26.4s, v26.4s, #2 + mla v25.4s, v16.4s, v7.4s // * 3 -> b + mla v26.4s, v17.4s, v7.4s + uxtl v19.8h, v19.8b // src + mov v0.16b, v1.16b + umlal v25.4s, v2.4h, v19.4h // b + a * src + umlal2 v26.4s, v2.8h, v19.8h + mov v2.16b, v3.16b + rshrn v25.4h, v25.4s, #9 + rshrn2 v25.8h, v26.4s, #9 + mov v4.16b, v5.16b + st1 {v25.8h}, [x0], #16 + + b.le 3f + mov v16.16b, v18.16b + mov v19.16b, v21.16b + mov v22.16b, v24.16b + ld1 {v1.8h}, [x9], #16 + ld1 {v3.8h}, [x4], #16 + ld1 {v5.8h}, [x10], #16 + ld1 {v17.4s, v18.4s}, [x7], #32 + ld1 {v20.4s, v21.4s}, [x3], #32 + ld1 {v23.4s, v24.4s}, [x8], #32 + b 2b + +3: + subs x6, x6, #1 + b.le 0f + mov x5, x13 + add x0, x0, x12, lsl #1 + add x1, x1, x2 + add x3, x3, x11, lsl #2 + add x7, x7, x11, lsl #2 + add x8, x8, x11, lsl #2 + add x4, x4, x14, lsl #1 + add x9, x9, x14, lsl #1 + add x10, x10, x14, lsl #1 + b 1b +0: + ret +endfunc + +// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter2_8bpc_neon, export=1 + add x7, x3, #(4*(SUM_STRIDE)) + sub x3, x3, #(4*(SUM_STRIDE)) + add x8, x4, #(2*(SUM_STRIDE)) + sub x4, x4, #(2*(SUM_STRIDE)) + mov x9, #(2*SUM_STRIDE) + mov x10, #FILTER_OUT_STRIDE + add x11, x5, #7 + bic x11, x11, #7 // Aligned width + sub x2, x2, x11 + sub x10, x10, x11 + sub x9, x9, x11 + sub x9, x9, #4 // We read 4 extra elements from a + sub x12, x9, #4 // We read 8 extra elements from b + mov x11, x5 + movi v4.8h, #5 + movi v5.4s, #5 + movi v6.8h, #6 + movi v7.4s, #6 +1: + ld1 {v0.8h, v1.8h}, [x4], #32 + ld1 {v2.8h, v3.8h}, [x8], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48 + +2: + subs x5, x5, #8 + ext v24.16b, v0.16b, v1.16b, #4 // +1-stride + ext v25.16b, v2.16b, v3.16b, #4 // +1+stride + ext v22.16b, v0.16b, v1.16b, #2 // -stride + ext v23.16b, v2.16b, v3.16b, #2 // +stride + add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride + add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride + add v2.8h, v22.8h, v23.8h // -stride, +stride + add v0.8h, v0.8h, v25.8h + + ext v22.16b, v16.16b, v17.16b, #4 // -stride + ext v23.16b, v17.16b, v18.16b, #4 + ext v24.16b, v19.16b, v20.16b, #4 // +stride + ext v25.16b, v20.16b, v21.16b, #4 + ext v26.16b, v16.16b, v17.16b, #8 // +1-stride + ext v27.16b, v17.16b, v18.16b, #8 + ext v28.16b, v19.16b, v20.16b, #8 // +1+stride + ext v29.16b, v20.16b, v21.16b, #8 + mul v0.8h, v0.8h, v4.8h // * 5 + mla v0.8h, v2.8h, v6.8h // * 6 + ld1 {v31.8b}, [x1], #8 + add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride + add v17.4s, v17.4s, v27.4s + add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride + add v20.4s, v20.4s, v29.4s + add v16.4s, v16.4s, v19.4s + add v17.4s, v17.4s, v20.4s + + add v22.4s, v22.4s, v24.4s // -stride, +stride + add v23.4s, v23.4s, v25.4s + // This is, surprisingly, faster than other variants where the + // mul+mla pairs are further apart, on Cortex A53. 
+ mul v16.4s, v16.4s, v5.4s // * 5 + mla v16.4s, v22.4s, v7.4s // * 6 + mul v17.4s, v17.4s, v5.4s // * 5 + mla v17.4s, v23.4s, v7.4s // * 6 + + uxtl v31.8h, v31.8b + umlal v16.4s, v0.4h, v31.4h // b + a * src + umlal2 v17.4s, v0.8h, v31.8h + mov v0.16b, v1.16b + rshrn v16.4h, v16.4s, #9 + rshrn2 v16.8h, v17.4s, #9 + mov v2.16b, v3.16b + st1 {v16.8h}, [x0], #16 + + b.le 3f + mov v16.16b, v18.16b + mov v19.16b, v21.16b + ld1 {v1.8h}, [x4], #16 + ld1 {v3.8h}, [x8], #16 + ld1 {v17.4s, v18.4s}, [x3], #32 + ld1 {v20.4s, v21.4s}, [x7], #32 + b 2b + +3: + subs x6, x6, #1 + b.le 0f + mov x5, x11 + add x0, x0, x10, lsl #1 + add x1, x1, x2 + add x3, x3, x9, lsl #2 + add x7, x7, x9, lsl #2 + add x4, x4, x12, lsl #1 + add x8, x8, x12, lsl #1 + mov x13, x3 + mov x14, x4 + + ld1 {v0.8h, v1.8h}, [x4], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 + +4: + subs x5, x5, #8 + ext v23.16b, v0.16b, v1.16b, #4 // +1 + ext v22.16b, v0.16b, v1.16b, #2 // 0 + add v0.8h, v0.8h, v23.8h // -1, +1 + + ext v24.16b, v16.16b, v17.16b, #4 // 0 + ext v25.16b, v17.16b, v18.16b, #4 + ext v26.16b, v16.16b, v17.16b, #8 // +1 + ext v27.16b, v17.16b, v18.16b, #8 + mul v2.8h, v22.8h, v6.8h // * 6 + mla v2.8h, v0.8h, v4.8h // * 5 -> a + ld1 {v31.8b}, [x1], #8 + add v16.4s, v16.4s, v26.4s // -1, +1 + add v17.4s, v17.4s, v27.4s + uxtl v31.8h, v31.8b + // This is, surprisingly, faster than other variants where the + // mul+mla pairs are further apart, on Cortex A53. + mul v24.4s, v24.4s, v7.4s // * 6 + mla v24.4s, v16.4s, v5.4s // * 5 -> b + mul v25.4s, v25.4s, v7.4s // * 6 + mla v25.4s, v17.4s, v5.4s // * 5 -> b + + umlal v24.4s, v2.4h, v31.4h // b + a * src + umlal2 v25.4s, v2.8h, v31.8h + mov v0.16b, v1.16b + rshrn v24.4h, v24.4s, #8 + rshrn2 v24.8h, v25.4s, #8 + mov v16.16b, v18.16b + st1 {v24.8h}, [x0], #16 + + b.le 5f + ld1 {v1.8h}, [x4], #16 + ld1 {v17.4s, v18.4s}, [x3], #32 + b 4b + +5: + subs x6, x6, #1 + b.le 0f + mov x5, x11 + add x0, x0, x10, lsl #1 + add x1, x1, x2 + mov x3, x13 // Rewind x3/x4 to where they started + mov x4, x14 + b 1b +0: + ret +endfunc + +// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int w, const int h, +// const int wt); +function sgr_weighted1_8bpc_neon, export=1 + dup v31.8h, w7 + cmp x6, #2 + add x9, x0, x1 + add x10, x2, x3 + add x11, x4, #2*FILTER_OUT_STRIDE + mov x7, #(4*FILTER_OUT_STRIDE) + lsl x1, x1, #1 + lsl x3, x3, #1 + add x8, x5, #7 + bic x8, x8, #7 // Aligned width + sub x1, x1, x8 + sub x3, x3, x8 + sub x7, x7, x8, lsl #1 + mov x8, x5 + b.lt 2f +1: + ld1 {v0.8b}, [x2], #8 + ld1 {v4.8b}, [x10], #8 + ld1 {v1.8h}, [x4], #16 + ld1 {v5.8h}, [x11], #16 + subs x5, x5, #8 + ushll v0.8h, v0.8b, #4 // u + ushll v4.8h, v4.8b, #4 // u + sub v1.8h, v1.8h, v0.8h // t1 - u + sub v5.8h, v5.8h, v4.8h // t1 - u + ushll v2.4s, v0.4h, #7 // u << 7 + ushll2 v3.4s, v0.8h, #7 // u << 7 + ushll v6.4s, v4.4h, #7 // u << 7 + ushll2 v7.4s, v4.8h, #7 // u << 7 + smlal v2.4s, v1.4h, v31.4h // v + smlal2 v3.4s, v1.8h, v31.8h // v + smlal v6.4s, v5.4h, v31.4h // v + smlal2 v7.4s, v5.8h, v31.8h // v + rshrn v2.4h, v2.4s, #11 + rshrn2 v2.8h, v3.4s, #11 + rshrn v6.4h, v6.4s, #11 + rshrn2 v6.8h, v7.4s, #11 + sqxtun v2.8b, v2.8h + sqxtun v6.8b, v6.8h + st1 {v2.8b}, [x0], #8 + st1 {v6.8b}, [x9], #8 + b.gt 1b + + sub x6, x6, #2 + cmp x6, #1 + b.lt 0f + mov x5, x8 + add x0, x0, x1 + add x9, x9, x1 + add x2, x2, x3 + add x10, x10, x3 + add x4, x4, x7 + add x11, x11, x7 + b.eq 2f + b 1b + +2: + ld1 {v0.8b}, [x2], #8 + 
ld1 {v1.8h}, [x4], #16 + subs x5, x5, #8 + ushll v0.8h, v0.8b, #4 // u + sub v1.8h, v1.8h, v0.8h // t1 - u + ushll v2.4s, v0.4h, #7 // u << 7 + ushll2 v3.4s, v0.8h, #7 // u << 7 + smlal v2.4s, v1.4h, v31.4h // v + smlal2 v3.4s, v1.8h, v31.8h // v + rshrn v2.4h, v2.4s, #11 + rshrn2 v2.8h, v3.4s, #11 + sqxtun v2.8b, v2.8h + st1 {v2.8b}, [x0], #8 + b.gt 2b +0: + ret +endfunc + +// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int16_t *t2, +// const int w, const int h, +// const int16_t wt[2]); +function sgr_weighted2_8bpc_neon, export=1 + ldr x8, [sp] + cmp x7, #2 + add x10, x0, x1 + add x11, x2, x3 + add x12, x4, #2*FILTER_OUT_STRIDE + add x13, x5, #2*FILTER_OUT_STRIDE + ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1] + mov x8, #4*FILTER_OUT_STRIDE + lsl x1, x1, #1 + lsl x3, x3, #1 + add x9, x6, #7 + bic x9, x9, #7 // Aligned width + sub x1, x1, x9 + sub x3, x3, x9 + sub x8, x8, x9, lsl #1 + mov x9, x6 + b.lt 2f +1: + ld1 {v0.8b}, [x2], #8 + ld1 {v16.8b}, [x11], #8 + ld1 {v1.8h}, [x4], #16 + ld1 {v17.8h}, [x12], #16 + ld1 {v2.8h}, [x5], #16 + ld1 {v18.8h}, [x13], #16 + subs x6, x6, #8 + ushll v0.8h, v0.8b, #4 // u + ushll v16.8h, v16.8b, #4 // u + sub v1.8h, v1.8h, v0.8h // t1 - u + sub v2.8h, v2.8h, v0.8h // t2 - u + sub v17.8h, v17.8h, v16.8h // t1 - u + sub v18.8h, v18.8h, v16.8h // t2 - u + ushll v3.4s, v0.4h, #7 // u << 7 + ushll2 v4.4s, v0.8h, #7 // u << 7 + ushll v19.4s, v16.4h, #7 // u << 7 + ushll2 v20.4s, v16.8h, #7 // u << 7 + smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u) + smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) + smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) + smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) + smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u) + smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u) + smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u) + smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u) + rshrn v3.4h, v3.4s, #11 + rshrn2 v3.8h, v4.4s, #11 + rshrn v19.4h, v19.4s, #11 + rshrn2 v19.8h, v20.4s, #11 + sqxtun v3.8b, v3.8h + sqxtun v19.8b, v19.8h + st1 {v3.8b}, [x0], #8 + st1 {v19.8b}, [x10], #8 + b.gt 1b + + subs x7, x7, #2 + cmp x7, #1 + b.lt 0f + mov x6, x9 + add x0, x0, x1 + add x10, x10, x1 + add x2, x2, x3 + add x11, x11, x3 + add x4, x4, x8 + add x12, x12, x8 + add x5, x5, x8 + add x13, x13, x8 + b.eq 2f + b 1b + +2: + ld1 {v0.8b}, [x2], #8 + ld1 {v1.8h}, [x4], #16 + ld1 {v2.8h}, [x5], #16 + subs x6, x6, #8 + ushll v0.8h, v0.8b, #4 // u + sub v1.8h, v1.8h, v0.8h // t1 - u + sub v2.8h, v2.8h, v0.8h // t2 - u + ushll v3.4s, v0.4h, #7 // u << 7 + ushll2 v4.4s, v0.8h, #7 // u << 7 + smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u) + smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) + smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) + smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) + rshrn v3.4h, v3.4s, #11 + rshrn2 v3.8h, v4.4s, #11 + sqxtun v3.8b, v3.8h + st1 {v3.8b}, [x0], #8 + b.gt 1b +0: + ret +endfunc |