diff options
author | David Conrad <david_conrad@apple.com> | 2022-06-08 01:03:03 +0300 |
---|---|---|
committer | David Conrad <david_conrad@apple.com> | 2022-07-11 20:28:11 +0300 |
commit | d503bb0ccaf104b2f13da0f092e09cc9411b3297 (patch) | |
tree | 883fd05023f54b55b80e9e37b933ed33029f81e7 | |
parent | 79bc755d19d61644360bef4402fcce69f280ea52 (diff) |
Don't trash the return stack buffer in the NEON loop filter
The NEON loop filter's innermost asm function can return to a different
location than the address that called it. This messes up the return stack
predictor, causing returns to be mispredicted.
Instead, rework the function to always return to the address that calls it,
and instead return the information needed for the caller to short-circuit
storing pixels.
-rw-r--r-- | src/arm/64/loopfilter.S | 39 | ||||
-rw-r--r-- | src/arm/64/loopfilter16.S | 39 |
2 files changed, 58 insertions, 20 deletions
diff --git a/src/arm/64/loopfilter.S b/src/arm/64/loopfilter.S index 2b9b5c4..63d5de1 100644 --- a/src/arm/64/loopfilter.S +++ b/src/arm/64/loopfilter.S @@ -28,6 +28,11 @@ #include "src/arm/asm.S" #include "util.S" +// depending on how many pixels need to be stored, returns: +// x14 = (1 << 0) : 0 pixels +// x14 = (1 << 4) : inner 4 pixels +// x14 = (1 << 6) : inner 6 pixels +// x14 = 0 : all pixels .macro loop_filter wd function lpf_16_wd\wd\()_neon uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0) @@ -77,8 +82,10 @@ function lpf_16_wd\wd\()_neon mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 - b.eq 9f // if (!fm || wd < 4) return; - + b.ne 9f // if (!fm || wd < 4) return; + mov x14, #(1 << 0) + ret +9: .if \wd >= 6 movi v10.16b, #1 uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0) @@ -474,20 +481,20 @@ function lpf_16_wd\wd\()_neon bif v11.16b, v29.16b, v15.16b // out q5 .endif + mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - ret x13 + mov x14, #(1 << 6) + ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - ret x14 + mov x14, #(1 << 4) + ret .endif -9: - // Return directly without writing back any pixels - ret x15 endfunc .endm @@ -497,22 +504,34 @@ loop_filter 6 loop_filter 4 .macro lpf_16_wd16 - adr x13, 7f - adr x14, 8f bl lpf_16_wd16_neon + cbz x14, 1f + tbnz x14, #6, 7f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_16_wd8 - adr x14, 8f bl lpf_16_wd8_neon + cbz x14, 1f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_16_wd6 bl lpf_16_wd6_neon + cbz x14, 1f + ret x15 +1: .endm .macro lpf_16_wd4 bl lpf_16_wd4_neon + cbz x14, 1f + ret x15 +1: .endm function lpf_v_4_16_neon
diff --git a/src/arm/64/loopfilter16.S b/src/arm/64/loopfilter16.S index aab0230..d181a3e 100644 --- a/src/arm/64/loopfilter16.S +++ b/src/arm/64/loopfilter16.S @@ -28,6 +28,11 @@ #include "src/arm/asm.S" #include "util.S" +// depending on how many pixels need to be stored, returns: +// x14 = (1 << 0) : 0 pixels +// x14 = (1 << 4) : inner 4 pixels +// x14 = (1 << 6) : inner 6 pixels +// x14 = 0 : all pixels .macro loop_filter wd function lpf_8_wd\wd\()_neon uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0) @@ -77,8 +82,10 @@ function lpf_8_wd\wd\()_neon mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 - b.eq 9f // if (!fm || wd < 4) return; - + b.ne 9f // if (!fm || wd < 4) return; + mov x14, #(1 << 0) + ret +9: .if \wd >= 6 movi v10.8h, #1 uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) @@ -360,20 +367,20 @@ function lpf_8_wd\wd\()_neon bif v11.16b, v29.16b, v15.16b // out q5 .endif + mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - ret x13 + mov x14, #(1 << 6) + ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - ret x14 + mov x14, #(1 << 4) + ret .endif -9: - // Return directly without writing back any pixels - ret x15 endfunc .endm @@ -383,22 +390,34 @@ loop_filter 6 loop_filter 4 .macro lpf_8_wd16 - adr x13, 7f - adr x14, 8f bl lpf_8_wd16_neon + cbz x14, 1f + tbnz x14, #6, 7f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_8_wd8 - adr x14, 8f bl lpf_8_wd8_neon + cbz x14, 1f + tbnz x14, #4, 8f + ret x15 +1: .endm .macro lpf_8_wd6 bl lpf_8_wd6_neon + cbz x14, 1f + ret x15 +1: .endm .macro lpf_8_wd4 bl lpf_8_wd4_neon + cbz x14, 1f + ret x15 +1: .endm function lpf_v_4_8_neon |