diff options
author | Salome Thirot <salome.thirot@arm.com> | 2021-10-15 19:05:03 +0300 |
---|---|---|
committer | Salome Thirot <salome.thirot@arm.com> | 2021-10-27 14:09:37 +0300 |
commit | da0da5488b5a5d43c4fef6d4308a0cd930f7961a (patch) | |
tree | d95934a5194b64f065a0c5d670d4a47e89caed3e /src/arm | |
parent | eb0308bcdf1bfd651f2359ed49a2f6dc258aed86 (diff) |
arm64: Change br instructions to ret for function returns
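When Branch Target Identification (BTI) is enabled, the target of an indirect
branch made with `br x<n>` must be a BTI landing pad, whereas the target of
`ret x<n>` is exempt from that check. Returning with `ret x<n>` instead of
`br x<n>` therefore removes the need for a landing pad at the address held in x<n>.
Using `ret` instead of `br` does not have any performance implications.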
Signed-off-by: Jonathan Wright <jonathan.wright@arm.com>
Signed-off-by: Matthew Dalzell <matthew.dalzell@arm.com>
Signed-off-by: Salome Thirot <salome.thirot@arm.com>
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/64/itx.S | 52 | ||||
-rw-r--r-- | src/arm/64/itx16.S | 52 | ||||
-rw-r--r-- | src/arm/64/loopfilter.S | 36 | ||||
-rw-r--r-- | src/arm/64/loopfilter16.S | 36 | ||||
-rw-r--r-- | src/arm/64/mc.S | 8 | ||||
-rw-r--r-- | src/arm/64/mc16.S | 8 |
6 files changed, 96 insertions, 96 deletions
diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S index 98147a3..ea0de6a 100644 --- a/src/arm/64/itx.S +++ b/src/arm/64/itx.S @@ -660,7 +660,7 @@ L(itx_4x4_end): st1 {v1.s}[0], [x0], x1 st1 {v1.s}[1], [x0], x1 - br x15 + ret x15 endfunc .macro def_fn_4x4 txfm1, txfm2 @@ -897,7 +897,7 @@ function inv_txfm_\variant\()add_8x8_neon blr x5 load_add_store_8x8 x0, x7 - br x15 + ret x15 endfunc .endm @@ -962,7 +962,7 @@ function inv_txfm_add_8x4_neon blr x5 load_add_store_8x4 x0, x7 - br x15 + ret x15 endfunc function inv_txfm_add_4x8_neon @@ -988,7 +988,7 @@ function inv_txfm_add_4x8_neon blr x5 load_add_store_4x8 x0, x7 - br x15 + ret x15 endfunc .macro def_fn_48 w, h, txfm1, txfm2 @@ -1404,7 +1404,7 @@ function inv_txfm_horz\suffix\()_16x8_neon st1 {\i}, [x6], #16 .endr - br x14 + ret x14 endfunc .endm @@ -1419,7 +1419,7 @@ function inv_txfm_add_vert_8x16_neon .endr blr x5 load_add_store_8x16 x6, x7 - br x14 + ret x14 endfunc function inv_txfm_add_16x16_neon @@ -1453,7 +1453,7 @@ function inv_txfm_add_16x16_neon .endr add sp, sp, #512 - br x15 + ret x15 endfunc .macro def_fn_16x16 txfm1, txfm2, eob_half @@ -1553,7 +1553,7 @@ function inv_txfm_\variant\()add_16x4_neon add x6, x0, #8 load_add_store_8x4 x6, x7 - br x15 + ret x15 endfunc function inv_txfm_\variant\()add_4x16_neon @@ -1621,7 +1621,7 @@ function inv_txfm_\variant\()add_4x16_neon load_add_store_4x16 x0, x6 - br x15 + ret x15 endfunc .endm @@ -1731,7 +1731,7 @@ function inv_txfm_\variant\()add_16x8_neon add x0, x0, #8 load_add_store_8x8 x0, x7 - br x15 + ret x15 endfunc function inv_txfm_\variant\()add_8x16_neon @@ -1804,7 +1804,7 @@ function inv_txfm_\variant\()add_8x16_neon load_add_store_8x16 x0, x6 - br x15 + ret x15 endfunc .endm @@ -2089,7 +2089,7 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon store2 v25.8h, v17.8h, \shift store2 v24.8h, v16.8h, \shift .purgem store2 - br x14 + ret x14 endfunc .endm @@ -2163,7 +2163,7 @@ function inv_txfm_add_vert_dct_8x32_neon combine v28.8h, v29.8h, v30.8h, v31.8h, 
sqsub, x9 .purgem combine - br x14 + ret x14 endfunc const eob_32x32 @@ -2374,7 +2374,7 @@ function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 .endr add sp, sp, #2048 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 @@ -2423,7 +2423,7 @@ function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 .endr add sp, sp, #1024 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 @@ -2468,7 +2468,7 @@ function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 .endr add sp, sp, #1024 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 @@ -2525,7 +2525,7 @@ function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 bl inv_txfm_add_vert_dct_8x32_neon add sp, sp, #512 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 @@ -2559,7 +2559,7 @@ function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 b.lt 1b add sp, sp, #512 - br x15 + ret x15 endfunc function inv_dct64_step1_neon @@ -2886,7 +2886,7 @@ function inv_txfm_dct\suffix\()_8h_x64_neon, export=1 bl inv_dct64_step2_neon - br x14 + ret x14 endfunc .endm @@ -2943,7 +2943,7 @@ function inv_txfm_horz_dct_64x8_neon cmp x7, x8 b.lt 1b - br x14 + ret x14 endfunc function inv_txfm_add_vert_dct_8x64_neon @@ -2999,7 +2999,7 @@ function inv_txfm_add_vert_dct_8x64_neon cmp x7, x8 b.lt 1b - br x14 + ret x14 endfunc function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 @@ -3053,7 +3053,7 @@ function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 .endr add sp, x5, #64*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 @@ -3106,7 +3106,7 @@ function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 .endr add sp, x5, #64*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 @@ -3158,7 +3158,7 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 .endr add sp, x5, #32*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_64x16_8bpc_neon, 
export=1 @@ -3212,7 +3212,7 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 .endr add sp, x4, #64*16*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 @@ -3265,5 +3265,5 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 .endr add sp, x5, #16*32*2 - br x15 + ret x15 endfunc diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S index 5e55dd3..d1ac4bc 100644 --- a/src/arm/64/itx16.S +++ b/src/arm/64/itx16.S @@ -541,7 +541,7 @@ L(itx_4x4_end): st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x0], x1 - br x15 + ret x15 endfunc .macro def_fn_4x4 txfm1, txfm2 @@ -784,7 +784,7 @@ function inv_txfm_add_8x8_neon blr x5 load_add_store_8x8 x0, x7 - br x15 + ret x15 endfunc .macro def_fn_8x8 txfm1, txfm2, eob_half @@ -853,7 +853,7 @@ function inv_txfm_add_8x4_neon blr x5 load_add_store_8x4 x0, x7 - br x15 + ret x15 endfunc function inv_txfm_add_4x8_neon @@ -902,7 +902,7 @@ function inv_txfm_add_4x8_neon blr x5 load_add_store_4x8 x0, x7 - br x15 + ret x15 endfunc .macro def_fn_48 w, h, txfm1, txfm2, eob_half @@ -1282,7 +1282,7 @@ function inv_txfm_horz\suffix\()_16x4_neon st1 {\i}, [x6], #16 .endr - br x14 + ret x14 endfunc .endm @@ -1296,7 +1296,7 @@ function inv_txfm_add_vert_8x16_neon .endr blr x5 load_add_store_8x16 x6, x7 - br x14 + ret x14 endfunc function inv_txfm_add_16x16_neon @@ -1338,7 +1338,7 @@ function inv_txfm_add_16x16_neon .endr add sp, sp, #512 - br x15 + ret x15 endfunc const eob_16x16 @@ -1423,7 +1423,7 @@ function inv_txfm_add_16x4_neon add x6, x0, #16 load_add_store_8x4 x6, x7 - br x15 + ret x15 endfunc function inv_txfm_add_4x16_neon @@ -1517,7 +1517,7 @@ function inv_txfm_add_4x16_neon load_add_store_4x16 x0, x6 - br x15 + ret x15 endfunc const eob_4x16 @@ -1698,7 +1698,7 @@ function inv_txfm_add_16x8_neon ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 - br x15 + ret x15 endfunc function inv_txfm_add_8x16_neon @@ -1839,7 +1839,7 @@ function inv_txfm_add_8x16_neon ldp d10, d11, [sp, 
#0x10] ldp d8, d9, [sp], 0x20 - br x15 + ret x15 endfunc const eob_8x16 @@ -2141,7 +2141,7 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift .purgem store2 - br x14 + ret x14 endfunc .endm @@ -2216,7 +2216,7 @@ function inv_txfm_add_vert_dct_8x32_neon combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 .purgem combine - br x14 + ret x14 endfunc const eob_32x32 @@ -2533,7 +2533,7 @@ function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 .endr add sp, sp, #2048 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 @@ -2582,7 +2582,7 @@ function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 .endr add sp, sp, #1024 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 @@ -2632,7 +2632,7 @@ function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 .endr add sp, sp, #1024 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 @@ -2692,7 +2692,7 @@ function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 bl inv_txfm_add_vert_dct_8x32_neon add sp, sp, #512 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 @@ -2743,7 +2743,7 @@ function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 b.lt 1b add sp, sp, #512 - br x15 + ret x15 endfunc function inv_dct64_step1_neon @@ -3070,7 +3070,7 @@ function inv_txfm_dct\suffix\()_4s_x64_neon bl inv_dct64_step2_neon - br x14 + ret x14 endfunc .endm @@ -3127,7 +3127,7 @@ function inv_txfm_horz_dct_64x4_neon cmp x7, x8 b.lt 1b - br x14 + ret x14 endfunc function inv_txfm_add_vert_dct_8x64_neon @@ -3184,7 +3184,7 @@ function inv_txfm_add_vert_dct_8x64_neon cmp x7, x8 b.lt 1b - br x14 + ret x14 endfunc function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 @@ -3238,7 +3238,7 @@ function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 .endr add sp, x5, #64*32*2 - br x15 + ret x15 endfunc function 
inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 @@ -3291,7 +3291,7 @@ function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 .endr add sp, x5, #64*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 @@ -3341,7 +3341,7 @@ function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 .endr add sp, x5, #32*32*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 @@ -3395,7 +3395,7 @@ function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 .endr add sp, x4, #64*16*2 - br x15 + ret x15 endfunc function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 @@ -3448,5 +3448,5 @@ function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 .endr add sp, x5, #16*32*2 - br x15 + ret x15 endfunc diff --git a/src/arm/64/loopfilter.S b/src/arm/64/loopfilter.S index 1d426f1..2b9b5c4 100644 --- a/src/arm/64/loopfilter.S +++ b/src/arm/64/loopfilter.S @@ -478,16 +478,16 @@ function lpf_16_wd\wd\()_neon .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - br x13 + ret x13 .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - br x14 + ret x14 .endif 9: // Return directly without writing back any pixels - br x15 + ret x15 endfunc .endm @@ -532,7 +532,7 @@ function lpf_v_4_16_neon st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_4_16_neon @@ -583,7 +583,7 @@ function lpf_h_4_16_neon st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 - br x15 + ret x15 endfunc function lpf_v_6_16_neon @@ -607,7 +607,7 @@ function lpf_v_6_16_neon st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_6_16_neon @@ -658,7 +658,7 @@ function lpf_h_6_16_neon st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 - br x15 + ret x15 endfunc function lpf_v_8_16_neon @@ -686,7 +686,7 @@ function lpf_v_8_16_neon st1 
{v26.16b}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 - br x15 + ret x15 8: sub x16, x0, x1, lsl #1 @@ -695,7 +695,7 @@ function lpf_v_8_16_neon st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_8_16_neon @@ -746,7 +746,7 @@ function lpf_h_8_16_neon st1 {v27.d}[0], [x16], x1 st1 {v27.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 8: sub x16, x0, x1, lsl #4 sub x16, x16, #2 @@ -770,7 +770,7 @@ function lpf_h_8_16_neon st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 - br x15 + ret x15 endfunc function lpf_v_16_16_neon @@ -813,7 +813,7 @@ function lpf_v_16_16_neon st1 {v11.16b}, [x0], x1 // q5 sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl #1 - br x15 + ret x15 7: sub x16, x0, x1 sub x16, x16, x1, lsl #1 @@ -825,7 +825,7 @@ function lpf_v_16_16_neon st1 {v26.16b}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 - br x15 + ret x15 8: sub x16, x0, x1, lsl #1 @@ -834,7 +834,7 @@ function lpf_v_16_16_neon st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_16_16_neon @@ -916,7 +916,7 @@ function lpf_h_16_16_neon st1 {v30.d}[1], [x0], x1 st1 {v5.d}[1], [x16], x1 st1 {v31.d}[1], [x0], x1 - br x15 + ret x15 7: sub x16, x0, x1, lsl #4 @@ -941,7 +941,7 @@ function lpf_h_16_16_neon st1 {v27.d}[0], [x16], x1 st1 {v27.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 8: sub x16, x0, x1, lsl #4 sub x16, x16, #2 @@ -965,7 +965,7 @@ function lpf_h_16_16_neon st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 - br x15 + ret x15 endfunc // void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -1096,7 +1096,7 @@ function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 - br x11 + ret x11 endfunc .endm diff --git a/src/arm/64/loopfilter16.S b/src/arm/64/loopfilter16.S index 8c9f98b..aab0230 100644 --- 
a/src/arm/64/loopfilter16.S +++ b/src/arm/64/loopfilter16.S @@ -364,16 +364,16 @@ function lpf_8_wd\wd\()_neon .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels - br x13 + ret x13 .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels - br x14 + ret x14 .endif 9: // Return directly without writing back any pixels - br x15 + ret x15 endfunc .endm @@ -418,7 +418,7 @@ function lpf_v_4_8_neon st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_4_8_neon @@ -453,7 +453,7 @@ function lpf_h_4_8_neon st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 endfunc function lpf_v_6_8_neon @@ -477,7 +477,7 @@ function lpf_v_6_8_neon st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_6_8_neon @@ -512,7 +512,7 @@ function lpf_h_6_8_neon st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 endfunc function lpf_v_8_8_neon @@ -540,7 +540,7 @@ function lpf_v_8_8_neon st1 {v26.8h}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 - br x15 + ret x15 8: sub x16, x0, x1, lsl #1 @@ -549,7 +549,7 @@ function lpf_v_8_8_neon st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_8_8_neon @@ -584,7 +584,7 @@ function lpf_h_8_8_neon st1 {v23.8h}, [x16], x1 st1 {v27.8h}, [x0], x1 add x0, x0, #8 - br x15 + ret x15 8: sub x16, x0, x1, lsl #3 sub x16, x16, #4 @@ -600,7 +600,7 @@ function lpf_h_8_8_neon st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 endfunc function lpf_v_16_8_neon @@ -643,7 +643,7 @@ function lpf_v_16_8_neon st1 {v11.8h}, [x0], x1 // q5 sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl #1 - br x15 + ret x15 7: sub x16, x0, x1 sub x16, x16, x1, lsl #1 @@ -655,7 +655,7 @@ function lpf_v_16_8_neon st1 {v26.8h}, 
[x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 - br x15 + ret x15 8: sub x16, x0, x1, lsl #1 @@ -664,7 +664,7 @@ function lpf_v_16_8_neon st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 - br x15 + ret x15 endfunc function lpf_h_16_8_neon @@ -714,7 +714,7 @@ function lpf_h_16_8_neon st1 {v30.8h}, [x0], x1 st1 {v5.8h}, [x16], x1 st1 {v31.8h}, [x0], x1 - br x15 + ret x15 7: sub x16, x0, x1, lsl #3 @@ -731,7 +731,7 @@ function lpf_h_16_8_neon st1 {v23.8h}, [x16], x1 st1 {v27.8h}, [x0], x1 add x0, x0, #8 - br x15 + ret x15 8: sub x16, x0, x1, lsl #3 sub x16, x16, #4 @@ -747,7 +747,7 @@ function lpf_h_16_8_neon st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 - br x15 + ret x15 endfunc // void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -892,7 +892,7 @@ function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 - br x11 + ret x11 endfunc .endm diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index ef7f23b..8a80b39 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -1979,7 +1979,7 @@ L(\type\()_8tap_hv): b 28b 0: - br x15 + ret x15 L(\type\()_8tap_filter_2): ld1 {v28.8b}, [\sr2], \s_strd @@ -2135,7 +2135,7 @@ L(\type\()_8tap_filter_2): mov v22.8b, v29.8b b 48b 0: - br x15 + ret x15 L(\type\()_8tap_filter_4): ld1 {v26.8b}, [\sr2], \s_strd @@ -2343,7 +2343,7 @@ L(\type\()_8tap_filter_4): .endif b 168b 0: - br x15 + ret x15 L(\type\()_8tap_filter_8_first): ld1 {v28.8b, v29.8b}, [\src], \s_strd @@ -3072,7 +3072,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1 add w6, w6, w4 b.gt 1b - br x15 + ret x15 endfunc .endm diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S index cec82a3..05bfa39 100644 --- a/src/arm/64/mc16.S +++ b/src/arm/64/mc16.S @@ -2139,7 +2139,7 @@ L(\type\()_8tap_hv): b 28b 0: - br x15 + ret x15 L(\type\()_8tap_filter_2): ld1 {v25.8h}, [\sr2], \s_strd @@ -2304,7 +2304,7 @@ L(\type\()_8tap_filter_2): mov 
v22.8b, v25.8b b 48b 0: - br x15 + ret x15 L(\type\()_8tap_filter_4): ld1 {v24.8h}, [\sr2], \s_strd @@ -2554,7 +2554,7 @@ L(\type\()_8tap_filter_4): add \dst, \dst, #16 b 168b 0: - br x15 + ret x15 L(\type\()_8tap_filter_8): ld1 {v4.8h, v5.8h}, [\sr2], \s_strd @@ -3398,7 +3398,7 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1 ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 - br x15 + ret x15 endfunc .endm |