diff options
author | David Conrad <david_conrad@apple.com> | 2022-06-08 01:03:03 +0300 |
---|---|---|
committer | David Conrad <david_conrad@apple.com> | 2022-07-11 20:28:11 +0300 |
commit | d503bb0ccaf104b2f13da0f092e09cc9411b3297 (patch) | |
tree | 883fd05023f54b55b80e9e37b933ed33029f81e7 | |
parent | 79bc755d19d61644360bef4402fcce69f280ea52 (diff) |
Don't trash the return stack buffer in the NEON loop filter
The NEON loop filter's innermost asm function can return to a different
location than the address that called it. This messes up the return stack
predictor, causing returns to be mispredicted.
Instead, rework the function to always return to the address that calls it,
and instead return the information needed for the caller to short-circuit
storing pixels.
-rw-r--r-- | src/arm/64/loopfilter.S | 39 | ||||
-rw-r--r-- | src/arm/64/loopfilter16.S | 39 |
2 files changed, 58 insertions, 20 deletions
diff --git a/src/arm/64/loopfilter.S b/src/arm/64/loopfilter.S index 2b9b5c4..63d5de1 100644 --- a/src/arm/64/loopfilter.S +++ b/src/arm/64/loopfilter.S @@ -28,6 +28,11 @@ #include "src/arm/asm.S" #include "util.S" +// depending on how many pixels need to be stored, returns: +// x14 = (1 << 0) : 0 pixels +// x14 = (1 << 4) : inner 4 pixels +// x14 = (1 << 6) : inner 6 pixels +// x14 = 0 : all pixels .macro loop_filter wd function lpf_16_wd\wd\()_neon uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0) @@ -77,8 +82,10 @@ function lpf_16_wd\wd\()_neon mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 - b.eq 9f // if (!fm || wd < 4) return; - + b.ne 9f // if (!fm || wd < 4) return; + mov x14, #(1 << 0) + ret +9: .if \wd >= 6 movi v10.16b, #1 uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0) @@ -474,20 +481,20 @@ function lpf_16_wd\wd\()_neon bif v11.16b, v29.16b, v15.16b // out q5 .endif + mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - ret x13 + mov x14, #(1 << 6) + ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - ret x14 + mov x14, #(1 << 4) + ret .endif -9: - // Return directly without writing back any pixels - ret x15 endfunc .endm @@ -497,22 +504,34 @@ loop_filter 6 loop_filter 4 .macro lpf_16_wd16 - adr x13, 7f - adr x14, 8f bl lpf_16_wd16_neon + cbz x14, 1f + tbnz x14, #6, 7f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_16_wd8 - adr x14, 8f bl lpf_16_wd8_neon + cbz x14, 1f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_16_wd6 bl lpf_16_wd6_neon + cbz x14, 1f + ret x15 +1: .endm .macro lpf_16_wd4 bl lpf_16_wd4_neon + cbz x14, 1f + ret x15 +1: .endm function lpf_v_4_16_neon
diff --git a/src/arm/64/loopfilter16.S b/src/arm/64/loopfilter16.S index aab0230..d181a3e 100644 --- a/src/arm/64/loopfilter16.S +++ b/src/arm/64/loopfilter16.S @@ -28,6 +28,11 @@ #include "src/arm/asm.S" #include "util.S" +// depending on how many pixels need to be stored, returns: +// x14 = (1 << 0) : 0 pixels +// x14 = (1 << 4) : inner 4 pixels +// x14 = (1 << 6) : inner 6 pixels +// x14 = 0 : all pixels .macro loop_filter wd function lpf_8_wd\wd\()_neon uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0) @@ -77,8 +82,10 @@ function lpf_8_wd\wd\()_neon mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 - b.eq 9f // if (!fm || wd < 4) return; - + b.ne 9f // if (!fm || wd < 4) return; + mov x14, #(1 << 0) + ret +9: .if \wd >= 6 movi v10.8h, #1 uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) @@ -360,20 +367,20 @@ function lpf_8_wd\wd\()_neon bif v11.16b, v29.16b, v15.16b // out q5 .endif + mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - ret x13 + mov x14, #(1 << 6) + ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - ret x14 + mov x14, #(1 << 4) + ret .endif -9: - // Return directly without writing back any pixels - ret x15 endfunc .endm @@ -383,22 +390,34 @@ loop_filter 6 loop_filter 4 .macro lpf_8_wd16 - adr x13, 7f - adr x14, 8f bl lpf_8_wd16_neon + cbz x14, 1f + tbnz x14, #6, 7f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_8_wd8 - adr x14, 8f bl lpf_8_wd8_neon + cbz x14, 1f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_8_wd6 bl lpf_8_wd6_neon + cbz x14, 1f + ret x15 +1: .endm .macro lpf_8_wd4 bl lpf_8_wd4_neon + cbz x14, 1f + ret x15 +1: .endm function lpf_v_4_8_neon |