Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'src/arm/64/looprestoration_common.S')
-rw-r--r--src/arm/64/looprestoration_common.S422
1 files changed, 422 insertions, 0 deletions
diff --git a/src/arm/64/looprestoration_common.S b/src/arm/64/looprestoration_common.S
new file mode 100644
index 0000000..dc07827
--- /dev/null
+++ b/src/arm/64/looprestoration_common.S
@@ -0,0 +1,422 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, const int w, const int h,
+// const enum LrEdgeFlags edges);
+// Sums three vertically adjacent rows of sumsq and sum in place, writing rows -1 to h, one 8 column slice at a time.
+function sgr_box3_v_neon, export=1
+ add w10, w3, #2 // Number of output rows to move back (h + 2 rows are written per slice)
+ mov w11, w3 // Number of input rows to move back (adjusted below per edge flag)
+ add w2, w2, #2 // Actual summed width
+ mov x7, #(4*SUM_STRIDE) // sumsq stride in bytes (int32_t elements)
+ mov x8, #(2*SUM_STRIDE) // sum stride in bytes (int16_t elements)
+ sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride; x0 is the sumsq output pointer, starting at row -1
+ sub x1, x1, #(2*SUM_STRIDE) // sum -= stride; x1 is the sum output pointer, starting at row -1
+
+ tst w4, #4 // LR_HAVE_TOP
+ b.eq 0f
+ // If have top, read from row -2.
+ sub x5, x0, #(4*SUM_STRIDE) // x5 = sumsq input pointer
+ sub x6, x1, #(2*SUM_STRIDE) // x6 = sum input pointer
+ add w11, w11, #2 // The top path loads two more rows per slice
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add x5, x0, #(4*SUM_STRIDE) // x5 = sumsq input pointer
+ add x6, x1, #(2*SUM_STRIDE) // x6 = sum input pointer
+1:
+
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.eq 1f
+ // LR_HAVE_BOTTOM
+ add w3, w3, #2 // Sum all h+2 lines with the main loop
+ add w11, w11, #2 // The main loop then loads two more rows per slice
+1:
+ mov w9, w3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into v16-v21 and v24-v26 taking top
+ // padding into consideration.
+ tst w4, #4 // LR_HAVE_TOP; the loads below leave the flags intact for b.eq
+ ld1 {v16.4s, v17.4s}, [x5], x7 // First sumsq row (8 x int32), advance by one row
+ ld1 {v24.8h}, [x6], x8 // First sum row (8 x int16), advance by one row
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v18.4s, v19.4s}, [x5], x7 // Second sumsq row
+ ld1 {v25.8h}, [x6], x8 // Second sum row
+ ld1 {v20.4s, v21.4s}, [x5], x7 // Third sumsq row
+ ld1 {v26.8h}, [x6], x8 // Third sum row
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v18.16b, v16.16b // Reuse the first row for the second window slot
+ mov v19.16b, v17.16b
+ mov v25.16b, v24.16b
+ mov v20.16b, v16.16b // ...and for the third window slot
+ mov v21.16b, v17.16b
+ mov v26.16b, v24.16b
+
+3:
+ subs w3, w3, #1 // Row counter; the flags are only consumed by b.le after the moves
+.macro add3
+ add v16.4s, v16.4s, v18.4s // sumsq: first + second row
+ add v17.4s, v17.4s, v19.4s
+ add v24.8h, v24.8h, v25.8h // sum: first + second row
+ add v16.4s, v16.4s, v20.4s // sumsq: + third row
+ add v17.4s, v17.4s, v21.4s
+ add v24.8h, v24.8h, v26.8h // sum: + third row
+ st1 {v16.4s, v17.4s}, [x0], x7 // Store the sumsq result, advance to the next output row
+ st1 {v24.8h}, [x1], x8 // Store the sum result, advance to the next output row
+.endm
+ add3
+ mov v16.16b, v18.16b // Slide the window: second row becomes first
+ mov v17.16b, v19.16b
+ mov v24.16b, v25.16b
+ mov v18.16b, v20.16b // ...and third row becomes second
+ mov v19.16b, v21.16b
+ mov v25.16b, v26.16b
+ b.le 4f
+ ld1 {v20.4s, v21.4s}, [x5], x7 // Load the next row into the third slot
+ ld1 {v26.8h}, [x6], x8
+ b 3b
+
+4:
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.ne 5f
+ // !LR_HAVE_BOTTOM
+ // Produce two more rows, extending the already loaded rows.
+ add3
+ mov v16.16b, v18.16b // Second and third slots already hold the same row here
+ mov v17.16b, v19.16b
+ mov v24.16b, v25.16b
+ add3
+
+5: // End of one vertical slice.
+ subs w2, w2, #8 // Eight columns are handled per slice
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ msub x5, x7, x11, x5 // x5 -= stride * input rows
+ msub x6, x8, x11, x6
+ // Output pointers
+ msub x0, x7, x10, x0 // x0 -= stride * output rows
+ msub x1, x8, x10, x1
+ add x0, x0, #32 // Next slice: 8 x int32 = 32 bytes
+ add x1, x1, #16 // Next slice: 8 x int16 = 16 bytes
+ add x5, x5, #32
+ add x6, x6, #16
+ mov w3, w9 // Restore the row counter
+ b 1b
+
+0:
+ ret
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, const int w, const int h,
+// const enum LrEdgeFlags edges);
+// Sums five vertically adjacent rows of sumsq and sum in place, storing only every second row, one 8 column slice at a time.
+function sgr_box5_v_neon, export=1
+ add w10, w3, #2 // Number of output rows to move back
+ mov w11, w3 // Number of input rows to move back
+ add w2, w2, #8 // Actual summed width
+ mov x7, #(4*SUM_STRIDE) // sumsq stride in bytes (int32_t elements)
+ mov x8, #(2*SUM_STRIDE) // sum stride in bytes (int16_t elements)
+ sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride; x0 is the sumsq output pointer
+ sub x1, x1, #(2*SUM_STRIDE) // sum -= stride; x1 is the sum output pointer
+
+ tst w4, #4 // LR_HAVE_TOP
+ b.eq 0f
+ // If have top, read from row -2.
+ sub x5, x0, #(4*SUM_STRIDE) // x5 = sumsq input pointer
+ sub x6, x1, #(2*SUM_STRIDE) // x6 = sum input pointer
+ add w11, w11, #2 // The top path loads two more rows per slice
+ b 1f
+0:
+ // !LR_HAVE_TOP
+ // If we don't have top, read from row 0 even if
+ // we start writing to row -1.
+ add x5, x0, #(4*SUM_STRIDE) // x5 = sumsq input pointer
+ add x6, x1, #(2*SUM_STRIDE) // x6 = sum input pointer
+1:
+
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.eq 0f
+ // LR_HAVE_BOTTOM
+ add w3, w3, #2 // Handle h+2 lines with the main loop
+ add w11, w11, #2 // NOTE(review): the loop steps in pairs, so the w10/w11 rewind counts seem to match only for even h on this path - confirm callers
+ b 1f
+0:
+ // !LR_HAVE_BOTTOM
+ sub w3, w3, #1 // Handle h-1 lines with the main loop
+1:
+ mov w9, w3 // Backup of h for next loops
+
+1:
+ // Start of horizontal loop; start one vertical filter slice.
+ // Start loading rows into v16-v25 and v26-v30 taking top
+ // padding into consideration.
+ tst w4, #4 // LR_HAVE_TOP; the loads below leave the flags intact for b.eq
+ ld1 {v16.4s, v17.4s}, [x5], x7 // First sumsq row (8 x int32), advance by one row
+ ld1 {v26.8h}, [x6], x8 // First sum row (8 x int16), advance by one row
+ b.eq 2f
+ // LR_HAVE_TOP
+ ld1 {v20.4s, v21.4s}, [x5], x7 // Next row goes into the third window slot
+ ld1 {v28.8h}, [x6], x8
+ mov v18.16b, v16.16b // Second slot duplicates the first loaded row
+ mov v19.16b, v17.16b
+ mov v27.16b, v26.16b
+ ld1 {v22.4s, v23.4s}, [x5], x7 // Fourth window slot
+ ld1 {v29.8h}, [x6], x8
+ b 3f
+2: // !LR_HAVE_TOP
+ mov v18.16b, v16.16b // Fill window slots two to four with the first loaded row
+ mov v19.16b, v17.16b
+ mov v27.16b, v26.16b
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v28.16b, v26.16b
+ mov v22.16b, v16.16b
+ mov v23.16b, v17.16b
+ mov v29.16b, v26.16b
+
+3:
+ cbz w3, 4f // Counter already zero: handled by the special case at 4:
+ ld1 {v24.4s, v25.4s}, [x5], x7 // Fifth window slot
+ ld1 {v30.8h}, [x6], x8
+
+3:
+ // Start of vertical loop
+ subs w3, w3, #2 // Two rows per iteration; the flags are only consumed by b.le after shift2
+.macro add5
+ add v16.4s, v16.4s, v18.4s // sumsq: window rows 1 + 2
+ add v17.4s, v17.4s, v19.4s
+ add v26.8h, v26.8h, v27.8h // sum: window rows 1 + 2
+ add v0.4s, v20.4s, v22.4s // sumsq: window rows 3 + 4, kept in scratch v0/v1
+ add v1.4s, v21.4s, v23.4s
+ add v2.8h, v28.8h, v29.8h // sum: window rows 3 + 4, kept in scratch v2
+ add v16.4s, v16.4s, v24.4s // Add window row 5
+ add v17.4s, v17.4s, v25.4s
+ add v26.8h, v26.8h, v30.8h
+ add v16.4s, v16.4s, v0.4s // Add the scratch partial sums
+ add v17.4s, v17.4s, v1.4s
+ add v26.8h, v26.8h, v2.8h
+ st1 {v16.4s, v17.4s}, [x0], x7 // Store the sumsq result, advance one output row
+ st1 {v26.8h}, [x1], x8 // Store the sum result, advance one output row
+.endm
+ add5
+.macro shift2
+ mov v16.16b, v20.16b // Slide the window by two rows: row 3 becomes row 1
+ mov v17.16b, v21.16b
+ mov v26.16b, v28.16b
+ mov v18.16b, v22.16b // Row 4 becomes row 2
+ mov v19.16b, v23.16b
+ mov v27.16b, v29.16b
+ mov v20.16b, v24.16b // Row 5 becomes row 3
+ mov v21.16b, v25.16b
+ mov v28.16b, v30.16b
+.endm
+ shift2
+ add x0, x0, x7 // Skip one more output row, so only every second row is written
+ add x1, x1, x8
+ b.le 5f
+ ld1 {v22.4s, v23.4s}, [x5], x7 // Load two new rows into window slots 4 and 5
+ ld1 {v29.8h}, [x6], x8
+ ld1 {v24.4s, v25.4s}, [x5], x7
+ ld1 {v30.8h}, [x6], x8
+ b 3b
+
+4:
+ // h == 1, !LR_HAVE_BOTTOM.
+ // Pad the last row with the only content row, and add.
+ mov v24.16b, v22.16b
+ mov v25.16b, v23.16b
+ mov v30.16b, v29.16b
+ add5
+ shift2
+ add x0, x0, x7 // Skip one output row, as in the main loop
+ add x1, x1, x8
+ add5
+ b 6f
+
+5:
+ tst w4, #8 // LR_HAVE_BOTTOM
+ b.ne 6f
+ // !LR_HAVE_BOTTOM
+ cbnz w3, 5f // Counter is nonzero: take the two-rows-left case below
+ // The intended three edge rows left; output the one at h-2 and
+ // the past edge one at h.
+ ld1 {v22.4s, v23.4s}, [x5], x7 // Last content row
+ ld1 {v29.8h}, [x6], x8
+ // Pad the past-edge row from the last content row.
+ mov v24.16b, v22.16b
+ mov v25.16b, v23.16b
+ mov v30.16b, v29.16b
+ add5
+ shift2
+ add x0, x0, x7 // Skip one output row, as in the main loop
+ add x1, x1, x8
+ // The last two rows are already padded properly here.
+ add5
+ b 6f
+
+5:
+ // w3 == -1, two rows left, output one.
+ // Pad the last two rows from the mid one.
+ mov v22.16b, v20.16b
+ mov v23.16b, v21.16b
+ mov v29.16b, v28.16b
+ mov v24.16b, v20.16b
+ mov v25.16b, v21.16b
+ mov v30.16b, v28.16b
+ add5
+ add x0, x0, x7 // Skip one output row, as in the main loop
+ add x1, x1, x8
+ b 6f
+
+6: // End of one vertical slice.
+ subs w2, w2, #8 // Eight columns are handled per slice
+ b.le 0f
+ // Move pointers back up to the top and loop horizontally.
+ // Input pointers
+ msub x5, x7, x11, x5 // x5 -= stride * input rows
+ msub x6, x8, x11, x6
+ // Output pointers
+ msub x0, x7, x10, x0 // x0 -= stride * output rows
+ msub x1, x8, x10, x1
+ add x0, x0, #32 // Next slice: 8 x int32 = 32 bytes
+ add x1, x1, #16 // Next slice: 8 x int16 = 16 bytes
+ add x5, x5, #32
+ add x6, x6, #16
+ mov w3, w9 // Restore the row counter
+ b 1b
+
+0:
+ ret // NOTE(review): only add5 is purged below; the shift2 macro stays defined after this function
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, const int w, const int h, const int strength);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, const int w, const int h, const int strength);
+// Arguments arrive as x0 = a, x1 = b, w2 = w, w3 = h, w4 = strength. Each stub adjusts h and sets up
+// n (v31), one_by_x (x5) and the row stride in elements (x8) for sgr_calc_ab_neon.
+function sgr_calc_ab1_neon, export=1
+ add w3, w3, #2 // h += 2; 32-bit add, since the upper half of an int argument register is unspecified
+ movi v31.4s, #9 // n
+ mov x5, #455 // one_by_x, broadcast into v30 by sgr_calc_ab_neon
+ mov x8, #SUM_STRIDE // Row stride in elements: every row is processed
+ b sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+ add w3, w3, #3 // h += 3; 32-bit add, since the upper half of an int argument register is unspecified
+ asr w3, w3, #1 // h /= 2
+ movi v31.4s, #25 // n
+ mov x5, #164 // one_by_x, broadcast into v30 by sgr_calc_ab_neon
+ mov x8, #(2*SUM_STRIDE) // Row stride in elements: every second row. No branch follows; this relies on falling through into sgr_calc_ab_neon, which must stay directly below
+endfunc
+// sgr_calc_ab_neon: common loop entered from the two stubs above. Expects x0 = a, x1 = b, w2 = w, x3 = row count, w4 = strength, x5 = one_by_x, x8 = row stride in elements, v31 = n.
+function sgr_calc_ab_neon
+ movrel x12, X(sgr_x_by_x)
+ ld1 {v16.16b, v17.16b, v18.16b}, [x12] // First 48 bytes of the sgr_x_by_x table
+ movi v19.16b, #5
+ movi v20.8b, #55 // idx of last 5
+ movi v21.8b, #72 // idx of last 4
+ movi v22.8b, #101 // idx of last 3
+ movi v23.8b, #169 // idx of last 2
+ movi v24.8b, #254 // idx of last 1
+ add w2, w2, #2 // w += 2; 32-bit add, since the upper half of an int argument register is unspecified
+ add x7, x2, #7
+ bic x7, x7, #7 // aligned w
+ sub x7, x8, x7 // increment between rows, in elements
+ movi v29.8h, #1, lsl #8 // 256 in every lane
+ dup v28.4s, w4 // strength (s)
+ dup v30.4s, w5 // one_by_x
+ sub x0, x0, #(4*(SUM_STRIDE)) // Start one row above a
+ sub x1, x1, #(2*(SUM_STRIDE)) // Start one row above b
+ mov x6, x2 // backup of w
+ sub v16.16b, v16.16b, v19.16b // Bias the table by -5; the bias is added back after the lookup
+ sub v17.16b, v17.16b, v19.16b
+ sub v18.16b, v18.16b, v19.16b
+1:
+ subs x2, x2, #8 // Eight elements per iteration; the flags are only consumed by b.gt after the stores
+ ld1 {v0.4s, v1.4s}, [x0] // a
+ ld1 {v2.8h}, [x1] // b
+ mul v0.4s, v0.4s, v31.4s // a * n
+ mul v1.4s, v1.4s, v31.4s // a * n
+ umull v3.4s, v2.4h, v2.4h // b * b
+ umull2 v4.4s, v2.8h, v2.8h // b * b
+ uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0)
+ uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0)
+ mul v0.4s, v0.4s, v28.4s // p * s
+ mul v1.4s, v1.4s, v28.4s // p * s
+ uqshrn v0.4h, v0.4s, #16 // Saturating narrow to 16 bits
+ uqshrn2 v0.8h, v1.4s, #16
+ uqrshrn v0.8b, v0.8h, #4 // imin(z, 255)
+
+ cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5
+ cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4
+ tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b // Biased lookup; indices of 48 and above give 0
+ cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3
+ cmhi v5.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2
+ add v25.8b, v25.8b, v26.8b
+ cmhi v6.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1
+ add v27.8b, v27.8b, v5.8b
+ add v6.8b, v6.8b, v19.8b // Fold the +5 bias in with the last mask
+ add v25.8b, v25.8b, v27.8b
+ add v1.8b, v1.8b, v6.8b
+ add v1.8b, v1.8b, v25.8b // x = lookup + 5 - number of thresholds exceeded
+ uxtl v1.8h, v1.8b // x
+
+ umull v3.4s, v1.4h, v2.4h // x * BB[i]
+ umull2 v4.4s, v1.8h, v2.8h // x * BB[i]
+ mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x
+ srshr v3.4s, v3.4s, #12 // AA[i]
+ srshr v4.4s, v4.4s, #12 // AA[i]
+ sub v2.8h, v29.8h, v1.8h // 256 - x
+
+ st1 {v3.4s, v4.4s}, [x0], #32 // Overwrite a with AA, advance 8 x int32
+ st1 {v2.8h}, [x1], #16 // Overwrite b with 256 - x, advance 8 x int16
+ b.gt 1b
+
+ subs x3, x3, #1 // One row done
+ b.le 0f
+ add x0, x0, x7, lsl #2 // Skip to the next row: x7 elements of 4 bytes
+ add x1, x1, x7, lsl #1 // Skip to the next row: x7 elements of 2 bytes
+ mov x2, x6 // Restore w
+ b 1b
+0:
+ ret
+endfunc