diff options
author | Fadi Hanna <fadim@microsoft.com> | 2018-07-05 19:23:52 +0300 |
---|---|---|
committer | Fadi Hanna <fadim@microsoft.com> | 2018-07-05 19:23:52 +0300 |
commit | 5c54131e87a5a9b52b16697b3459f2249d32146e (patch) | |
tree | c275b50b6ea7ff77c6389f1033f5fd9608c351a1 /src/Native | |
parent | 303df5960283a70111bd5e09fcd5e1792a502a36 (diff) |
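Fix a perf issue discovered in the arm64 assembly stubs: use the BR instruction for tail calls instead of the RET instruction. RET carries a hint to the branch predictor that the branch is a subroutine return, so using it to jump to a tail-call target that is not the caller's return address tends to be mispredicted; BR is the indirect branch that carries no such return hint.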
These changes should improve the overall performance on ARM64. An example of the measured perf gains: interface calls on cached cells run about 3.6x faster (roughly 1.53M vs. 0.43M iterations/sec, i.e. about 3.6x fewer cycles per iteration). Here's the test output for 100000000 iterations:
BEFORE the changes:
=======================
"LowLevelPerf.exe" -name INTERFACE_INTERFACE_METHOD -iters 100000000
INTERFACE_INTERFACE_METHOD
Dynamic Timer = 425064.815690 Iters/Sec
Seconds = 235.258239 Seconds
Process Cycles = 565810805623.000000 Cycles
Process Cycles/Iter = 5658.108056 Cycles/Iters
AFTER the changes:
=======================
"LowLevelPerf.exe" -name INTERFACE_INTERFACE_METHOD -iters 100000000
INTERFACE_INTERFACE_METHOD
Dynamic Timer = 1531103.010283 Iters/Sec
Seconds = 65.312392 Seconds
Process Cycles = 156754061586.000000 Cycles
Process Cycles/Iter = 1567.540616 Cycles/Iters
[tfs-changeset: 1706580]
Diffstat (limited to 'src/Native')
-rw-r--r-- | src/Native/Runtime/arm64/CallingConventionConverterHelpers.asm | 2 | ||||
-rw-r--r-- | src/Native/Runtime/arm64/ExceptionHandling.asm | 2 | ||||
-rw-r--r-- | src/Native/Runtime/arm64/GcProbe.asm | 6 | ||||
-rw-r--r-- | src/Native/Runtime/arm64/InteropThunksHelpers.asm | 2 | ||||
-rw-r--r-- | src/Native/Runtime/arm64/MiscStubs.asm | 2 | ||||
-rw-r--r-- | src/Native/Runtime/arm64/StubDispatch.asm | 8 | ||||
-rw-r--r-- | src/Native/Runtime/arm64/ThunkPoolThunks.asm | 2 | ||||
-rw-r--r-- | src/Native/Runtime/arm64/UniversalTransition.asm | 2 |
8 files changed, 13 insertions, 13 deletions
diff --git a/src/Native/Runtime/arm64/CallingConventionConverterHelpers.asm b/src/Native/Runtime/arm64/CallingConventionConverterHelpers.asm index 2d43d5bba..f60e11578 100644 --- a/src/Native/Runtime/arm64/CallingConventionConverterHelpers.asm +++ b/src/Native/Runtime/arm64/CallingConventionConverterHelpers.asm @@ -46,7 +46,7 @@ POINTER_SIZE equ 0x08 ldr xip0, [xip0, #POINTER_SIZE] ; get pointer to CallingConventionConverter_CommonCallingStub_PointerData into xip0 ldr x12, [xip0, #POINTER_SIZE] ; get address of UniversalTransitionThunk (which we'll tailcall to later) ldr xip0, [xip0] ; get address of ManagedCallConverterThunk (target for universal thunk to call) - ret x12 + br x12 LEAF_END __jmpstub__CallingConventionConverter_CommonCallingStub ;; diff --git a/src/Native/Runtime/arm64/ExceptionHandling.asm b/src/Native/Runtime/arm64/ExceptionHandling.asm index c71194da9..35843afee 100644 --- a/src/Native/Runtime/arm64/ExceptionHandling.asm +++ b/src/Native/Runtime/arm64/ExceptionHandling.asm @@ -489,7 +489,7 @@ DonePopping NoAbort ;; reset SP and jump to continuation address mov sp, x2 - ret x0 + br x0 NESTED_END RhpCallCatchFunclet diff --git a/src/Native/Runtime/arm64/GcProbe.asm b/src/Native/Runtime/arm64/GcProbe.asm index caa400ce4..73d674a5c 100644 --- a/src/Native/Runtime/arm64/GcProbe.asm +++ b/src/Native/Runtime/arm64/GcProbe.asm @@ -566,7 +566,7 @@ EXTRA_SAVE_SIZE equ (32*16) EPILOG_RESTORE_REG_PAIR x27, x28, #0x60 EPILOG_NOP ldr x9, [sp, #0x78] EPILOG_RESTORE_REG_PAIR fp, lr, #(SIZEOF__PAL_LIMITED_CONTEXT + 0x20)! 
- EPILOG_NOP ret x9 + EPILOG_NOP br x9 NESTED_END RhpHijackForGcStressLeaf @@ -675,7 +675,7 @@ EXTRA_SAVE_SIZE equ (32*16) ldr x2, [sp, #PROBE_FRAME_SIZE] FREE_PROBE_FRAME 0x10, {true} ; This restores exception object back into x0 EPILOG_NOP mov x1, x0 ; Move the Exception object back into x1 where the catch handler expects it - EPILOG_NOP ret x2 + EPILOG_NOP br x2 MEND ;; @@ -847,7 +847,7 @@ DoneWaitingForGc EPILOG_NOP ldr x1, [sp, #8] ; hijack target address EPILOG_STACK_FREE 0x10 - EPILOG_NOP ret x1 ; jump to the hijack target + EPILOG_NOP br x1 ; jump to the hijack target Abort FREE_LOOP_HIJACK_FRAME diff --git a/src/Native/Runtime/arm64/InteropThunksHelpers.asm b/src/Native/Runtime/arm64/InteropThunksHelpers.asm index 88bb7da5d..cc8489ef1 100644 --- a/src/Native/Runtime/arm64/InteropThunksHelpers.asm +++ b/src/Native/Runtime/arm64/InteropThunksHelpers.asm @@ -58,7 +58,7 @@ __SECTIONREL_ThunkParamSlot ;; Now load the target address and jump to it. ldr xip0, [xip0, #POINTER_SIZE] - ret xip0 + br xip0 LEAF_END RhCommonStub diff --git a/src/Native/Runtime/arm64/MiscStubs.asm b/src/Native/Runtime/arm64/MiscStubs.asm index ba18a93c2..99c05e4a3 100644 --- a/src/Native/Runtime/arm64/MiscStubs.asm +++ b/src/Native/Runtime/arm64/MiscStubs.asm @@ -96,7 +96,7 @@ RhpCheckCctor__SlowPath EPILOG_RESTORE_REG_PAIR fp, lr, #0x20! ;; tail-call the class lib cctor check function. This function is required to return its first ;; argument, so that x0 can be preserved. 
- EPILOG_NOP ret x12 + EPILOG_NOP br x12 NESTED_END RhpCheckCctor__SlowPath2 diff --git a/src/Native/Runtime/arm64/StubDispatch.asm b/src/Native/Runtime/arm64/StubDispatch.asm index 82a4f861e..8f3b7488c 100644 --- a/src/Native/Runtime/arm64/StubDispatch.asm +++ b/src/Native/Runtime/arm64/StubDispatch.asm @@ -55,7 +55,7 @@ SECTIONREL_t_TLS_DispatchCell cmp x10, x11 bne %ft0 ;; Jump to label '0' ldr x9, [x9, #(OFFSETOF__InterfaceDispatchCache__m_rgEntries + ($entry * 16) + 8)] - ret x9 + br x9 0 ;; Label '0' MEND @@ -70,7 +70,7 @@ SECTIONREL_t_TLS_DispatchCell ;; Now load the target address and jump to it. ldr x9, [xip0, #8] - ret x9 + br x9 LEAF_END RhpCastableObjectDispatch_CommonStub LEAF_ENTRY RhpTailCallTLSDispatchCell @@ -79,7 +79,7 @@ SECTIONREL_t_TLS_DispatchCell ;; Tail call to the target of the dispatch cell, preserving the cell address in xip1 ldr x9, [xip1] - ret x9 + br x9 LEAF_END RhpTailCallTLSDispatchCell LEAF_ENTRY RhpCastableObjectDispatchHelper_TailCalled @@ -168,7 +168,7 @@ CurrentEntry SETA CurrentEntry + 1 ;; Load the target address of the vtable into x12 ldr x12, [x12] - ret x12 + br x12 LEAF_END RhpVTableOffsetDispatch ;; diff --git a/src/Native/Runtime/arm64/ThunkPoolThunks.asm b/src/Native/Runtime/arm64/ThunkPoolThunks.asm index ddd2636db..6d9a02ff6 100644 --- a/src/Native/Runtime/arm64/ThunkPoolThunks.asm +++ b/src/Native/Runtime/arm64/ThunkPoolThunks.asm @@ -42,7 +42,7 @@ RO$name % 8 ;; fix offset to point to last QWROD in page : xip1 <- [xip0 + PAGE_SIZE - POINTER_SIZE] ;; tailcall to the location pointed at by the last qword in the data page ldr xip1, [xip0, #(PAGE_SIZE - POINTER_SIZE - ($groupIndex * THUNK_DATASIZE * 10 + THUNK_DATASIZE * $index))] - ret xip1 + br xip1 brk 0xf000 ;; Stubs need to be 16-byte aligned (see comment above). Filling padding with a ;; deterministic brk instruction, instead of having it just filled with zeros. 
diff --git a/src/Native/Runtime/arm64/UniversalTransition.asm b/src/Native/Runtime/arm64/UniversalTransition.asm index fe4763740..dc699ebed 100644 --- a/src/Native/Runtime/arm64/UniversalTransition.asm +++ b/src/Native/Runtime/arm64/UniversalTransition.asm @@ -147,7 +147,7 @@ EPILOG_RESTORE_REG_PAIR fp, lr, #STACK_SIZE! ;; Tailcall to the target address. - EPILOG_NOP ret x12 + EPILOG_NOP br x12 NESTED_END Rhp$FunctionName |