arm64: Change br instructions to ret for function returns

Using ret x<n> instead of br x<n> removes the need for a BTI landing pad at the target address in x<n>. Using 'ret' instead of 'br' does not have any performance implications.

Signed-off-by: Jonathan Wright <jonathan.wright@arm.com>
Signed-off-by: Matthew Dalzell <matthew.dalzell@arm.com>
Signed-off-by: Salome Thirot <salome.thirot@arm.com>
author: Salome Thirot <salome.thirot@arm.com> 2021-10-15 19:05:03 +0300
committer: Salome Thirot <salome.thirot@arm.com> 2021-10-27 14:09:37 +0300
commit: da0da5488b5a5d43c4fef6d4308a0cd930f7961a (patch)
tree: d95934a5194b64f065a0c5d670d4a47e89caed3e /src/arm
parent: eb0308bcdf1bfd651f2359ed49a2f6dc258aed86 (diff)
6 files changed, 96 insertions, 96 deletions
diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S
index 98147a3..ea0de6a 100644
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -660,7 +660,7 @@ L(itx_4x4_end):
         st1             {v1.s}[0], [x0], x1
         st1             {v1.s}[1], [x0], x1
 
-        br              x15
+        ret             x15
 endfunc
 
 .macro def_fn_4x4 txfm1, txfm2
@@ -897,7 +897,7 @@ function inv_txfm_\variant\()add_8x8_neon
         blr             x5
 
         load_add_store_8x8 x0, x7
-        br              x15
+        ret             x15
 endfunc
 .endm
 
@@ -962,7 +962,7 @@ function inv_txfm_add_8x4_neon
         blr             x5
 
         load_add_store_8x4 x0, x7
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_4x8_neon
@@ -988,7 +988,7 @@ function inv_txfm_add_4x8_neon
         blr             x5
 
         load_add_store_4x8 x0, x7
-        br              x15
+        ret             x15
 endfunc
 
 .macro def_fn_48 w, h, txfm1, txfm2
@@ -1404,7 +1404,7 @@ function inv_txfm_horz\suffix\()_16x8_neon
         st1             {\i}, [x6], #16
 .endr
 
-        br              x14
+        ret             x14
 endfunc
 .endm
 
@@ -1419,7 +1419,7 @@ function inv_txfm_add_vert_8x16_neon
 .endr
         blr             x5
         load_add_store_8x16 x6, x7
-        br              x14
+        ret             x14
 endfunc
 
 function inv_txfm_add_16x16_neon
@@ -1453,7 +1453,7 @@ function inv_txfm_add_16x16_neon
 .endr
 
         add             sp,  sp,  #512
-        br              x15
+        ret             x15
 endfunc
 
 .macro def_fn_16x16 txfm1, txfm2, eob_half
@@ -1553,7 +1553,7 @@ function inv_txfm_\variant\()add_16x4_neon
         add             x6,  x0,  #8
         load_add_store_8x4 x6, x7
 
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_\variant\()add_4x16_neon
@@ -1621,7 +1621,7 @@ function inv_txfm_\variant\()add_4x16_neon
 
         load_add_store_4x16 x0, x6
 
-        br              x15
+        ret             x15
 endfunc
 .endm
 
@@ -1731,7 +1731,7 @@ function inv_txfm_\variant\()add_16x8_neon
         add             x0,  x0,  #8
         load_add_store_8x8 x0, x7
 
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_\variant\()add_8x16_neon
@@ -1804,7 +1804,7 @@ function inv_txfm_\variant\()add_8x16_neon
 
         load_add_store_8x16 x0, x6
 
-        br              x15
+        ret             x15
 endfunc
 .endm
 
@@ -2089,7 +2089,7 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon
         store2          v25.8h,  v17.8h, \shift
         store2          v24.8h,  v16.8h, \shift
 .purgem store2
-        br              x14
+        ret             x14
 endfunc
 .endm
 
@@ -2163,7 +2163,7 @@ function inv_txfm_add_vert_dct_8x32_neon
         combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
 .purgem combine
 
-        br              x14
+        ret             x14
 endfunc
 
 const eob_32x32
@@ -2374,7 +2374,7 @@ function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
 .endr
 
         add             sp,  sp,  #2048
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
@@ -2423,7 +2423,7 @@ function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
 .endr
 
         add             sp,  sp,  #1024
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
@@ -2468,7 +2468,7 @@ function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
 .endr
 
         add             sp,  sp,  #1024
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
@@ -2525,7 +2525,7 @@ function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
         bl              inv_txfm_add_vert_dct_8x32_neon
 
         add             sp,  sp,  #512
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
@@ -2559,7 +2559,7 @@ function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
         b.lt            1b
 
         add             sp,  sp,  #512
-        br              x15
+        ret             x15
 endfunc
 
 function inv_dct64_step1_neon
@@ -2886,7 +2886,7 @@ function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
 
         bl              inv_dct64_step2_neon
 
-        br              x14
+        ret             x14
 endfunc
 .endm
 
@@ -2943,7 +2943,7 @@ function inv_txfm_horz_dct_64x8_neon
 
         cmp             x7,  x8
         b.lt            1b
-        br              x14
+        ret             x14
 endfunc
 
 function inv_txfm_add_vert_dct_8x64_neon
@@ -2999,7 +2999,7 @@ function inv_txfm_add_vert_dct_8x64_neon
         cmp             x7,  x8
         b.lt            1b
 
-        br              x14
+        ret             x14
 endfunc
 
 function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
@@ -3053,7 +3053,7 @@ function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
 .endr
 
         add             sp,  x5,  #64*32*2
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
@@ -3106,7 +3106,7 @@ function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
 .endr
 
         add             sp,  x5,  #64*32*2
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
@@ -3158,7 +3158,7 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
 .endr
 
         add             sp,  x5,  #32*32*2
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
@@ -3212,7 +3212,7 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
 .endr
 
         add             sp,  x4,  #64*16*2
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
@@ -3265,5 +3265,5 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
 .endr
 
         add             sp,  x5,  #16*32*2
-        br              x15
+        ret             x15
 endfunc
diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S
index 5e55dd3..d1ac4bc 100644
--- a/src/arm/64/itx16.S
+++ b/src/arm/64/itx16.S
@@ -541,7 +541,7 @@ L(itx_4x4_end):
         st1             {v1.d}[0], [x0], x1
         st1             {v1.d}[1], [x0], x1
 
-        br              x15
+        ret             x15
 endfunc
 
 .macro def_fn_4x4 txfm1, txfm2
@@ -784,7 +784,7 @@ function inv_txfm_add_8x8_neon
         blr             x5
 
         load_add_store_8x8 x0, x7
-        br              x15
+        ret             x15
 endfunc
 
 .macro def_fn_8x8 txfm1, txfm2, eob_half
@@ -853,7 +853,7 @@ function inv_txfm_add_8x4_neon
         blr             x5
 
         load_add_store_8x4 x0, x7
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_4x8_neon
@@ -902,7 +902,7 @@ function inv_txfm_add_4x8_neon
         blr             x5
 
         load_add_store_4x8 x0, x7
-        br              x15
+        ret             x15
 endfunc
 
 .macro def_fn_48 w, h, txfm1, txfm2, eob_half
@@ -1282,7 +1282,7 @@ function inv_txfm_horz\suffix\()_16x4_neon
         st1             {\i}, [x6], #16
 .endr
 
-        br              x14
+        ret             x14
 endfunc
 .endm
 
@@ -1296,7 +1296,7 @@ function inv_txfm_add_vert_8x16_neon
 .endr
         blr             x5
         load_add_store_8x16 x6, x7
-        br              x14
+        ret             x14
 endfunc
 
 function inv_txfm_add_16x16_neon
@@ -1338,7 +1338,7 @@ function inv_txfm_add_16x16_neon
 .endr
 
         add             sp,  sp,  #512
-        br              x15
+        ret             x15
 endfunc
 
 const eob_16x16
@@ -1423,7 +1423,7 @@ function inv_txfm_add_16x4_neon
         add             x6,  x0,  #16
         load_add_store_8x4 x6, x7
 
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_4x16_neon
@@ -1517,7 +1517,7 @@ function inv_txfm_add_4x16_neon
 
         load_add_store_4x16 x0, x6
 
-        br              x15
+        ret             x15
 endfunc
 
 const eob_4x16
@@ -1698,7 +1698,7 @@ function inv_txfm_add_16x8_neon
         ldp             d12, d13, [sp, #0x20]
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x40
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_8x16_neon
@@ -1839,7 +1839,7 @@ function inv_txfm_add_8x16_neon
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x20
 
-        br              x15
+        ret             x15
 endfunc
 
 const eob_8x16
@@ -2141,7 +2141,7 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon
         store2          v29.4s,  v25.4s,  v21.4s,  v17.4s,  \shift
         store2          v28.4s,  v24.4s,  v20.4s,  v16.4s,  \shift
 .purgem store2
-        br              x14
+        ret             x14
 endfunc
 .endm
 
@@ -2216,7 +2216,7 @@ function inv_txfm_add_vert_dct_8x32_neon
         combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
 .purgem combine
 
-        br              x14
+        ret             x14
 endfunc
 
 const eob_32x32
@@ -2533,7 +2533,7 @@ function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
 .endr
 
         add             sp,  sp,  #2048
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
@@ -2582,7 +2582,7 @@ function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
 .endr
 
         add             sp,  sp,  #1024
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
@@ -2632,7 +2632,7 @@ function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
 .endr
 
         add             sp,  sp,  #1024
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
@@ -2692,7 +2692,7 @@ function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
         bl              inv_txfm_add_vert_dct_8x32_neon
 
         add             sp,  sp,  #512
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
@@ -2743,7 +2743,7 @@ function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
         b.lt            1b
 
         add             sp,  sp,  #512
-        br              x15
+        ret             x15
 endfunc
 
 function inv_dct64_step1_neon
@@ -3070,7 +3070,7 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
 
         bl              inv_dct64_step2_neon
 
-        br              x14
+        ret             x14
 endfunc
 .endm
 
@@ -3127,7 +3127,7 @@ function inv_txfm_horz_dct_64x4_neon
 
         cmp             x7,  x8
         b.lt            1b
-        br              x14
+        ret             x14
 endfunc
 
 function inv_txfm_add_vert_dct_8x64_neon
@@ -3184,7 +3184,7 @@ function inv_txfm_add_vert_dct_8x64_neon
         cmp             x7,  x8
         b.lt            1b
 
-        br              x14
+        ret             x14
 endfunc
 
 function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
@@ -3238,7 +3238,7 @@ function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
 .endr
 
         add             sp,  x5,  #64*32*2
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
@@ -3291,7 +3291,7 @@ function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
 .endr
 
         add             sp,  x5,  #64*32*2
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
@@ -3341,7 +3341,7 @@ function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
 .endr
 
         add             sp,  x5,  #32*32*2
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
@@ -3395,7 +3395,7 @@ function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
 .endr
 
         add             sp,  x4,  #64*16*2
-        br              x15
+        ret             x15
 endfunc
 
 function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
@@ -3448,5 +3448,5 @@ function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
 .endr
 
         add             sp,  x5,  #16*32*2
-        br              x15
+        ret             x15
 endfunc
diff --git a/src/arm/64/loopfilter.S b/src/arm/64/loopfilter.S
index 1d426f1..2b9b5c4 100644
--- a/src/arm/64/loopfilter.S
+++ b/src/arm/64/loopfilter.S
@@ -478,16 +478,16 @@ function lpf_16_wd\wd\()_neon
 .if \wd == 16
 7:
         // Return to a shorter epilogue, writing only the inner 6 pixels
-        br              x13
+        ret             x13
 .endif
 .if \wd >= 8
 8:
         // Return to a shorter epilogue, writing only the inner 4 pixels
-        br              x14
+        ret             x14
 .endif
 9:
         // Return directly without writing back any pixels
-        br              x15
+        ret             x15
 endfunc
 .endm
 
@@ -532,7 +532,7 @@ function lpf_v_4_16_neon
         st1             {v23.16b}, [x16], x1 // p0
         st1             {v25.16b}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_h_4_16_neon
@@ -583,7 +583,7 @@ function lpf_h_4_16_neon
         st1             {v25.s}[1], [x16], x1
         st1             {v25.s}[3], [x0],  x1
         add             x0,  x0,  #2
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_v_6_16_neon
@@ -607,7 +607,7 @@ function lpf_v_6_16_neon
         st1             {v23.16b}, [x16], x1 // p0
         st1             {v25.16b}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_h_6_16_neon
@@ -658,7 +658,7 @@ function lpf_h_6_16_neon
         st1             {v25.s}[1], [x16], x1
         st1             {v25.s}[3], [x0],  x1
         add             x0,  x0,  #2
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_v_8_16_neon
@@ -686,7 +686,7 @@ function lpf_v_8_16_neon
         st1             {v26.16b}, [x0],  x1 // q2
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
-        br              x15
+        ret             x15
 
 8:
         sub             x16, x0,  x1, lsl #1
@@ -695,7 +695,7 @@ function lpf_v_8_16_neon
         st1             {v23.16b}, [x16], x1 // p0
         st1             {v25.16b}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_h_8_16_neon
@@ -746,7 +746,7 @@ function lpf_h_8_16_neon
         st1             {v27.d}[0], [x16], x1
         st1             {v27.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        br              x15
+        ret             x15
 8:
         sub             x16, x0,  x1, lsl #4
         sub             x16, x16, #2
@@ -770,7 +770,7 @@ function lpf_h_8_16_neon
         st1             {v25.s}[1], [x16], x1
         st1             {v25.s}[3], [x0],  x1
         add             x0,  x0,  #2
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_v_16_16_neon
@@ -813,7 +813,7 @@ function lpf_v_16_16_neon
         st1             {v11.16b}, [x0],  x1 // q5
         sub             x0,  x0,  x1, lsl #2
         sub             x0,  x0,  x1, lsl #1
-        br              x15
+        ret             x15
 7:
         sub             x16, x0,  x1
         sub             x16, x16, x1, lsl #1
@@ -825,7 +825,7 @@ function lpf_v_16_16_neon
         st1             {v26.16b}, [x0],  x1 // q2
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
-        br              x15
+        ret             x15
 
 8:
         sub             x16, x0,  x1, lsl #1
@@ -834,7 +834,7 @@ function lpf_v_16_16_neon
         st1             {v23.16b}, [x16], x1 // p0
         st1             {v25.16b}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_h_16_16_neon
@@ -916,7 +916,7 @@ function lpf_h_16_16_neon
         st1             {v30.d}[1], [x0],  x1
         st1             {v5.d}[1],  [x16], x1
         st1             {v31.d}[1], [x0],  x1
-        br              x15
+        ret             x15
 
 7:
         sub             x16, x0,  x1, lsl #4
@@ -941,7 +941,7 @@ function lpf_h_16_16_neon
         st1             {v27.d}[0], [x16], x1
         st1             {v27.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        br              x15
+        ret             x15
 8:
         sub             x16, x0,  x1, lsl #4
         sub             x16, x16, #2
@@ -965,7 +965,7 @@ function lpf_h_16_16_neon
         st1             {v25.s}[1], [x16], x1
         st1             {v25.s}[3], [x0],  x1
         add             x0,  x0,  #2
-        br              x15
+        ret             x15
 endfunc
 
 // void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -1096,7 +1096,7 @@ function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
         ldp             d12, d13, [sp, #0x20]
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x40
-        br              x11
+        ret             x11
 endfunc
 .endm
 
diff --git a/src/arm/64/loopfilter16.S b/src/arm/64/loopfilter16.S
index 8c9f98b..aab0230 100644
--- a/src/arm/64/loopfilter16.S
+++ b/src/arm/64/loopfilter16.S
@@ -364,16 +364,16 @@ function lpf_8_wd\wd\()_neon
 .if \wd == 16
 7:
         // Return to a shorter epilogue, writing only the inner 6 pixels
-        br              x13
+        ret             x13
 .endif
 .if \wd >= 8
 8:
         // Return to a shorter epilogue, writing only the inner 4 pixels
-        br              x14
+        ret             x14
 .endif
 9:
         // Return directly without writing back any pixels
-        br              x15
+        ret             x15
 endfunc
 .endm
 
@@ -418,7 +418,7 @@ function lpf_v_4_8_neon
         st1             {v23.8h}, [x16], x1 // p0
         st1             {v25.8h}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_h_4_8_neon
@@ -453,7 +453,7 @@ function lpf_h_4_8_neon
         st1             {v25.d}[0], [x16], x1
         st1             {v25.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_v_6_8_neon
@@ -477,7 +477,7 @@ function lpf_v_6_8_neon
         st1             {v23.8h}, [x16], x1 // p0
         st1             {v25.8h}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_h_6_8_neon
@@ -512,7 +512,7 @@ function lpf_h_6_8_neon
         st1             {v25.d}[0], [x16], x1
         st1             {v25.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_v_8_8_neon
@@ -540,7 +540,7 @@ function lpf_v_8_8_neon
         st1             {v26.8h}, [x0],  x1 // q2
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
-        br              x15
+        ret             x15
 
 8:
         sub             x16, x0,  x1, lsl #1
@@ -549,7 +549,7 @@ function lpf_v_8_8_neon
         st1             {v23.8h}, [x16], x1 // p0
         st1             {v25.8h}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_h_8_8_neon
@@ -584,7 +584,7 @@ function lpf_h_8_8_neon
         st1             {v23.8h}, [x16], x1
         st1             {v27.8h}, [x0],  x1
         add             x0,  x0,  #8
-        br              x15
+        ret             x15
 8:
         sub             x16, x0,  x1, lsl #3
         sub             x16, x16, #4
@@ -600,7 +600,7 @@ function lpf_h_8_8_neon
         st1             {v25.d}[0], [x16], x1
         st1             {v25.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_v_16_8_neon
@@ -643,7 +643,7 @@ function lpf_v_16_8_neon
         st1             {v11.8h}, [x0],  x1 // q5
         sub             x0,  x0,  x1, lsl #2
         sub             x0,  x0,  x1, lsl #1
-        br              x15
+        ret             x15
 7:
         sub             x16, x0,  x1
         sub             x16, x16, x1, lsl #1
@@ -655,7 +655,7 @@ function lpf_v_16_8_neon
         st1             {v26.8h}, [x0],  x1 // q2
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
-        br              x15
+        ret             x15
 
 8:
         sub             x16, x0,  x1, lsl #1
@@ -664,7 +664,7 @@ function lpf_v_16_8_neon
         st1             {v23.8h}, [x16], x1 // p0
         st1             {v25.8h}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        br              x15
+        ret             x15
 endfunc
 
 function lpf_h_16_8_neon
@@ -714,7 +714,7 @@ function lpf_h_16_8_neon
         st1             {v30.8h}, [x0],  x1
         st1             {v5.8h},  [x16], x1
         st1             {v31.8h}, [x0],  x1
-        br              x15
+        ret             x15
 
 7:
         sub             x16, x0,  x1, lsl #3
@@ -731,7 +731,7 @@ function lpf_h_16_8_neon
         st1             {v23.8h}, [x16], x1
         st1             {v27.8h}, [x0],  x1
         add             x0,  x0,  #8
-        br              x15
+        ret             x15
 8:
         sub             x16, x0,  x1, lsl #3
         sub             x16, x16, #4
@@ -747,7 +747,7 @@ function lpf_h_16_8_neon
         st1             {v25.d}[0], [x16], x1
         st1             {v25.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        br              x15
+        ret             x15
 endfunc
 
 // void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -892,7 +892,7 @@ function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
         ldp             d12, d13, [sp, #0x20]
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x40
-        br              x11
+        ret             x11
 endfunc
 .endm
 
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index ef7f23b..8a80b39 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1979,7 +1979,7 @@ L(\type\()_8tap_hv):
         b               28b
 
 0:
-        br              x15
+        ret             x15
 
 L(\type\()_8tap_filter_2):
         ld1             {v28.8b},  [\sr2], \s_strd
@@ -2135,7 +2135,7 @@ L(\type\()_8tap_filter_2):
         mov             v22.8b,  v29.8b
         b               48b
 0:
-        br              x15
+        ret             x15
 
 L(\type\()_8tap_filter_4):
         ld1             {v26.8b}, [\sr2], \s_strd
@@ -2343,7 +2343,7 @@ L(\type\()_8tap_filter_4):
 .endif
         b               168b
 0:
-        br              x15
+        ret             x15
 
 L(\type\()_8tap_filter_8_first):
         ld1             {v28.8b, v29.8b},  [\src], \s_strd
@@ -3072,7 +3072,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
         add             w6,  w6,  w4
         b.gt            1b
 
-        br              x15
+        ret             x15
 endfunc
 .endm
 
diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S
index cec82a3..05bfa39 100644
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -2139,7 +2139,7 @@ L(\type\()_8tap_hv):
         b               28b
 
 0:
-        br              x15
+        ret             x15
 
 L(\type\()_8tap_filter_2):
         ld1             {v25.8h},  [\sr2], \s_strd
@@ -2304,7 +2304,7 @@ L(\type\()_8tap_filter_2):
         mov             v22.8b,  v25.8b
         b               48b
 0:
-        br              x15
+        ret             x15
 
 L(\type\()_8tap_filter_4):
         ld1             {v24.8h}, [\sr2], \s_strd
@@ -2554,7 +2554,7 @@ L(\type\()_8tap_filter_4):
         add             \dst,  \dst,  #16
         b               168b
 0:
-        br              x15
+        ret             x15
 
 L(\type\()_8tap_filter_8):
         ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
@@ -3398,7 +3398,7 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x40
 
-        br              x15
+        ret             x15
 endfunc
 .endm
author	Salome Thirot <salome.thirot@arm.com>	2021-10-15 19:05:03 +0300
committer	Salome Thirot <salome.thirot@arm.com>	2021-10-27 14:09:37 +0300
commit	da0da5488b5a5d43c4fef6d4308a0cd930f7961a (patch)
tree	d95934a5194b64f065a0c5d670d4a47e89caed3e /src/arm
parent	eb0308bcdf1bfd651f2359ed49a2f6dc258aed86 (diff)