Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Conrad <david_conrad@apple.com>2022-06-08 01:03:03 +0300
committerDavid Conrad <david_conrad@apple.com>2022-07-11 20:28:11 +0300
commitd503bb0ccaf104b2f13da0f092e09cc9411b3297 (patch)
tree883fd05023f54b55b80e9e37b933ed33029f81e7
parent79bc755d19d61644360bef4402fcce69f280ea52 (diff)
Don't trash the return stack buffer in the NEON loop filter
The NEON loop filter's innermost asm function can return to a different location than the address that called it. This messes up the return stack predictor, causing returns to be mispredicted. Instead, rework the function to always return to the address that calls it, and instead return the information needed for the caller to short-circuit storing pixels.
-rw-r--r--src/arm/64/loopfilter.S39
-rw-r--r--src/arm/64/loopfilter16.S39
2 files changed, 58 insertions, 20 deletions
diff --git a/src/arm/64/loopfilter.S b/src/arm/64/loopfilter.S
index 2b9b5c4..63d5de1 100644
--- a/src/arm/64/loopfilter.S
+++ b/src/arm/64/loopfilter.S
@@ -28,6 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
.macro loop_filter wd
function lpf_16_wd\wd\()_neon
uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0)
@@ -77,8 +82,10 @@ function lpf_16_wd\wd\()_neon
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
- b.eq 9f // if (!fm || wd < 4) return;
-
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0)
+ ret
+9:
.if \wd >= 6
movi v10.16b, #1
uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0)
@@ -474,20 +481,20 @@ function lpf_16_wd\wd\()_neon
bif v11.16b, v29.16b, v15.16b // out q5
.endif
+ mov x14, #0
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
- ret x13
+ mov x14, #(1 << 6)
+ ret
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
- ret x14
+ mov x14, #(1 << 4)
+ ret
.endif
-9:
- // Return directly without writing back any pixels
- ret x15
endfunc
.endm
@@ -497,22 +504,34 @@ loop_filter 6
loop_filter 4
.macro lpf_16_wd16
- adr x13, 7f
- adr x14, 8f
bl lpf_16_wd16_neon
+ cbz x14, 1f
+ tbnz x14, #6, 7f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_16_wd8
- adr x14, 8f
bl lpf_16_wd8_neon
+ cbz x14, 1f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_16_wd6
bl lpf_16_wd6_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
.macro lpf_16_wd4
bl lpf_16_wd4_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
function lpf_v_4_16_neon
diff --git a/src/arm/64/loopfilter16.S b/src/arm/64/loopfilter16.S
index aab0230..d181a3e 100644
--- a/src/arm/64/loopfilter16.S
+++ b/src/arm/64/loopfilter16.S
@@ -28,6 +28,11 @@
#include "src/arm/asm.S"
#include "util.S"
+// depending on how many pixels need to be stored, returns:
+// x14 = (1 << 0) : 0 pixels
+// x14 = (1 << 4) : inner 4 pixels
+// x14 = (1 << 6) : inner 6 pixels
+// x14 = 0 : all pixels
.macro loop_filter wd
function lpf_8_wd\wd\()_neon
uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0)
@@ -77,8 +82,10 @@ function lpf_8_wd\wd\()_neon
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
- b.eq 9f // if (!fm || wd < 4) return;
-
+ b.ne 9f // if (!fm || wd < 4) return;
+ mov x14, #(1 << 0)
+ ret
+9:
.if \wd >= 6
movi v10.8h, #1
uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
@@ -360,20 +367,20 @@ function lpf_8_wd\wd\()_neon
bif v11.16b, v29.16b, v15.16b // out q5
.endif
+ mov x14, #0
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
- ret x13
+ mov x14, #(1 << 6)
+ ret
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
- ret x14
+ mov x14, #(1 << 4)
+ ret
.endif
-9:
- // Return directly without writing back any pixels
- ret x15
endfunc
.endm
@@ -383,22 +390,34 @@ loop_filter 6
loop_filter 4
.macro lpf_8_wd16
- adr x13, 7f
- adr x14, 8f
bl lpf_8_wd16_neon
+ cbz x14, 1f
+ tbnz x14, #6, 7f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_8_wd8
- adr x14, 8f
bl lpf_8_wd8_neon
+ cbz x14, 1f
+ tbnz x14, #4, 8f
+ ret x15
+1:
.endm
.macro lpf_8_wd6
bl lpf_8_wd6_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
.macro lpf_8_wd4
bl lpf_8_wd4_neon
+ cbz x14, 1f
+ ret x15
+1:
.endm
function lpf_v_4_8_neon