Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/corert.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordotnet-bot <dotnet-bot@microsoft.com>2017-08-08 01:26:23 +0300
committerdotnet-bot <dotnet-bot@microsoft.com>2017-08-08 01:26:23 +0300
commit1bc2b144bd978d441c6afff399e33081edf41f18 (patch)
tree1d5dbd674d725956edfb7843e54c279991001925 /src/Native/Runtime/amd64
parent980f02121ea435fd5610deafd6248376f32bd246 (diff)
ProjectX: Loop Hijack for GC poll
This change enables hijacking loops for GC poll. A loop needs a GC poll if it's not proved to have other GC hijack points, such as function calls. Unlike the current table-based approach adopted in ProjectN, a piece of hijack code is injected into the loop to lead the control flow to the runtime for GC. For example, a loop usually has a conditional or unconditional branch at each of its back edges: $LN10: // loop body test eax, eax jne $LN10. With this change the branch code will be changed to cmp LoopHijackFlag, 0 jne $LNStub $LNsource: test eax, eax jne $LN10 $LNStub: (at the end of function) call RhpLoopHijackX jmp $LNsource. A compare-and-jump is injected directly into the loop body while the hijacking code is placed at the end of the function since it's considered "cold code", assuming GC does not happen very often. The global variable LoopHijackFlag is per module since we don't want to occupy or trash a register for it inside the loop. The runtime helper RhpLoopHijackX is designed to preserve every register, including callee-save registers and scratch registers, so that no register used in the loop is trashed. The transform happens very late in UTC in order not to affect the register allocation and to catch lower-introduced loops. The change only works for Od mode for now. In Ox mode, a liveness analysis of up-exposed CC flag register use is needed so that the LoopHijackFlag check doesn't trash the CC register. Testing: The Nutc\BugFixes\BusyWait.proj test, which is specifically designed to test this feature, passed. No regression in other ToF tests. [tfs-changeset: 1669432]
Diffstat (limited to 'src/Native/Runtime/amd64')
-rw-r--r--src/Native/Runtime/amd64/GcProbe.asm270
1 files changed, 270 insertions, 0 deletions
diff --git a/src/Native/Runtime/amd64/GcProbe.asm b/src/Native/Runtime/amd64/GcProbe.asm
index f6064f8a4..0b91f4546 100644
--- a/src/Native/Runtime/amd64/GcProbe.asm
+++ b/src/Native/Runtime/amd64/GcProbe.asm
@@ -549,6 +549,11 @@ EXTERN g_fHasFastFxsave : BYTE
FXSAVE_SIZE equ 512
+;; Trap a loop to GC.
+;; Set up the P/Invoke transition frame with the original loop target as the safe point.
+;; All registers, both volatile and non-volatile, are preserved.
+;; Input: ModuleHeader, chunk starting index and chunk sub-index which are used to get the original loop target
+;; The function is not called but jumped to directly
NESTED_ENTRY RhpLoopHijack, _TEXT
sizeof_OutgoingScratchSpace equ 20h
@@ -825,6 +830,271 @@ DontRestoreXmmAgain:
NESTED_END RhpLoopHijack, _TEXT
+;; RhpTrapToGC: trap the current thread to the GC.
+;; Set up the P/Invoke transition frame with the return address as the safe point.
+;; All registers, both volatile and non-volatile, are preserved.
+;; The function must be called, not jumped to, because it expects the return address on top of the stack.
+NESTED_ENTRY RhpTrapToGC, _TEXT
+
+ sizeof_OutgoingScratchSpace equ 20h
+ sizeof_PInvokeFrame equ OFFSETOF__PInvokeTransitionFrame__m_PreservedRegs + 15*8 ;; 15 preserved-register slots (rbx .. r11 in the diagram below)
+ sizeof_XmmAlignPad equ 8 ;; pad so that the FXSAVE area starts at [rsp + 0c0h]
+ sizeof_XmmSave equ FXSAVE_SIZE
+ sizeof_MachineFrame equ 6*8 ;; RIP, CS, EFLAGS, RSP, SS, padding
+ sizeof_InitialPushedArgs equ 3*8 ;; eflags, rcx, return address
+ sizeof_FixedFrame equ sizeof_OutgoingScratchSpace + sizeof_PInvokeFrame + sizeof_XmmAlignPad + sizeof_XmmSave + sizeof_MachineFrame
+
+ ;; On the stack on entry:
+ ;; [rsp ] -> Return address
+
+ ;; Prepare for our return by stashing a scratch register where we can pop it just before returning.
+ ;; The scratch register (rcx) will be used to hold PSP in the epilog.
+ push rcx
+
+ ;; save eflags before we trash them
+ pushfq
+
+ ;; What we want to get to (all offsets in this diagram are hexadecimal):
+ ;;
+ ;; [rsp ] -> outgoing scratch area
+ ;;
+ ;; [rsp + 20] -> m_RIP -------|
+ ;; [rsp + 28] -> m_FramePointer |
+ ;; [rsp + 30] -> m_pThread |
+ ;; [rsp + 38] -> m_dwFlags / m_dwAlignPad2 |
+ ;; [rsp + 40] -> rbx save |
+ ;; [rsp + 48] -> rsi save |
+ ;; [rsp + 50] -> rdi save |
+ ;; [rsp + 58] -> r12 save |
+ ;; [rsp + 60] -> r13 save |
+ ;; [rsp + 68] -> r14 save | PInvokeTransitionFrame
+ ;; [rsp + 70] -> r15 save |
+ ;; [rsp + 78] -> rsp save |
+ ;; [rsp + 80] -> rax save |
+ ;; [rsp + 88] -> rcx save |
+ ;; [rsp + 90] -> rdx save |
+ ;; [rsp + 98] -> r8 save |
+ ;; [rsp + a0] -> r9 save |
+ ;; [rsp + a8] -> r10 save |
+ ;; [rsp + b0] -> r11 save -------|
+ ;;
+ ;; [rsp + b8] -> [XmmAlignPad]
+ ;;
+ ;; [rsp + c0] -> FXSAVE area
+ ;;
+ ;; [rsp +2c0] | RIP |
+ ;; [rsp +2c8] | CS |
+ ;; [rsp +2d0] | EFLAGS | <-- 'machine frame'
+ ;; [rsp +2d8] | RSP |
+ ;; [rsp +2e0] | SS |
+ ;; [rsp +2e8] | padding |
+ ;;
+ ;; [rsp +2f0] [optional stack alignment]
+ ;;
+ ;; [PSP - 18] -> eflags save
+ ;; [PSP - 10] -> rcx save
+ ;; [PSP - 8] -> Return address
+ ;; [PSP] -> caller's frame
+
+ test rsp, 0Fh ; is RSP 16-byte aligned after the two pushes above?
+ jz AlreadyAligned
+
+ sub rsp, sizeof_XmmAlignPad + sizeof_XmmSave + sizeof_MachineFrame + 8 ; +8 to align RSP
+ push r11 ; save incoming R11 into save location
+ lea r11, [rsp + 8 + sizeof_XmmAlignPad + sizeof_XmmSave + sizeof_MachineFrame + 8 + sizeof_InitialPushedArgs] ; r11 <- PSP
+ jmp PspCalculated
+
+ AlreadyAligned:
+ sub rsp, sizeof_XmmAlignPad + sizeof_XmmSave + sizeof_MachineFrame
+ push r11 ; save incoming R11 into save location
+ lea r11, [rsp + 8 + sizeof_XmmAlignPad + sizeof_XmmSave + sizeof_MachineFrame + sizeof_InitialPushedArgs] ; r11 <- PSP
+
+ PspCalculated:
+ push r10 ; save incoming R10 into save location
+ xor r10d, r10d ; r10 <- 0, used to zero-fill the 'machine frame' fields below
+
+ ;;
+ ;; Populate the 'machine frame' in the diagram above. We have only pushed up to the 'r10 save', so we have not
+ ;; yet pushed 0xA8 bytes of that diagram.
+ ;;
+ ;; [rsp + {offset-in-target-frame-layout-diagram} - {as-yet-unpushed-stack-size}]
+ mov [rsp + 2c0h - 0a8h], r10 ; init RIP to zero
+ mov [rsp + 2c8h - 0a8h], r10 ; init CS to zero
+ mov [rsp + 2d0h - 0a8h], r10 ; init EFLAGS to zero
+ mov [rsp + 2d8h - 0a8h], r11 ; save PSP in the 'machine frame'
+ mov [rsp + 2e0h - 0a8h], r10 ; init SS to zero
+
+ .pushframe
+ .allocstack sizeof_XmmAlignPad + sizeof_XmmSave + 2*8 ;; only 2 of the regs from the PInvokeTransitionFrame are on the stack
+
+ push_vol_reg r9
+ push_vol_reg r8
+ push_vol_reg rdx
+ push_vol_reg rcx
+ push_vol_reg rax
+ push_vol_reg r11 ; PSP gets saved into the PInvokeTransitionFrame
+ push_nonvol_reg r15
+ push_nonvol_reg r14
+ push_nonvol_reg r13
+ push_nonvol_reg r12
+ push_nonvol_reg rdi
+ push_nonvol_reg rsi
+ push_nonvol_reg rbx
+ push_vol_reg PROBE_SAVE_FLAGS_EVERYTHING ; m_dwFlags / m_dwAlignPad2
+
+ ;; rdx <- GetThread(), TRASHES rcx
+ INLINE_GETTHREAD rdx, rcx
+
+ push_vol_reg rdx ; m_pThread
+ push_nonvol_reg rbp ; m_FramePointer
+ push_vol_reg r10 ; m_RIP (placeholder; overwritten with the return address below)
+
+ alloc_stack sizeof_OutgoingScratchSpace
+ END_PROLOGUE
+
+ mov rbx, r11 ; put PSP into RBX
+ mov rsi, rdx ; put Thread* into RSI
+
+ ; RBX is PSP
+ ; RSI is Thread*
+
+ fxsave [rsp + 0c0h] ; 0c0h -> offset of FXSAVE area in the diagram above
+
+ cmp [g_fHasFastFxsave], 0 ; fast fxsave won't save the xmm registers, so we must do it
+ jz DontSaveXmmAgain
+
+ ;; 0C0h -> offset of FXSAVE area
+ ;; 0A0h -> offset of xmm0 save area within the FXSAVE area
+ movdqa [rsp + 0c0h + 0a0h + 0*10h], xmm0
+ movdqa [rsp + 0c0h + 0a0h + 1*10h], xmm1
+ movdqa [rsp + 0c0h + 0a0h + 2*10h], xmm2
+ movdqa [rsp + 0c0h + 0a0h + 3*10h], xmm3
+ movdqa [rsp + 0c0h + 0a0h + 4*10h], xmm4
+ movdqa [rsp + 0c0h + 0a0h + 5*10h], xmm5
+ movdqa [rsp + 0c0h + 0a0h + 6*10h], xmm6
+ movdqa [rsp + 0c0h + 0a0h + 7*10h], xmm7
+ movdqa [rsp + 0c0h + 0a0h + 8*10h], xmm8
+ movdqa [rsp + 0c0h + 0a0h + 9*10h], xmm9
+ movdqa [rsp + 0c0h + 0a0h + 10*10h], xmm10
+ movdqa [rsp + 0c0h + 0a0h + 11*10h], xmm11
+ movdqa [rsp + 0c0h + 0a0h + 12*10h], xmm12
+ movdqa [rsp + 0c0h + 0a0h + 13*10h], xmm13
+ movdqa [rsp + 0c0h + 0a0h + 14*10h], xmm14
+ movdqa [rsp + 0c0h + 0a0h + 15*10h], xmm15
+
+DontSaveXmmAgain:
+ mov rax, [rbx - 8] ; rax <- return address ([PSP - 8])
+ mov [rsp + 2c0h], rax ; save return address into 'machine frame'
+ mov [rsp + 20h], rax ; save return address into PInvokeTransitionFrame
+
+ ; Early out if GC stress is currently suppressed. Do this after we have computed the real address to
+ ; return to but before we link the transition frame onto m_pHackPInvokeTunnel (because hitting this
+ ; condition implies we're running restricted callouts during a GC itself and we could end up
+ ; overwriting a co-op frame set by the code that caused the GC in the first place, e.g. a GC.Collect
+ ; call).
+ test dword ptr [rsi + OFFSETOF__Thread__m_ThreadStateFlags], TSF_SuppressGcStress + TSF_DoNotTriggerGc
+ jnz DoneWaitingForGc
+
+ ; link the frame into the Thread
+ lea rcx, [rsp + sizeof_OutgoingScratchSpace] ; rcx <- PInvokeTransitionFrame*
+ mov [rsi + OFFSETOF__Thread__m_pHackPInvokeTunnel], rcx
+
+ ;;
+ ;; Unhijack this thread, if necessary.
+ ;;
+ INLINE_THREAD_UNHIJACK rsi, rax, rcx ;; trashes RAX, RCX
+
+ifdef FEATURE_GC_STRESS
+ xor eax, eax
+ cmp [g_fGcStressStarted], eax
+ jz @F
+
+ mov rdx, [rsp + 2c0h] ; rdx <- return address saved in the 'machine frame'
+ mov rcx, [g_pTheRuntimeInstance]
+ call RuntimeInstance__ShouldHijackLoopForGcStress
+ cmp al, 0
+ je @F
+
+ call REDHAWKGCINTERFACE__STRESSGC
+@@:
+endif ;; FEATURE_GC_STRESS
+
+ lea rcx, [rsp + sizeof_OutgoingScratchSpace] ; calculate PInvokeTransitionFrame pointer
+ call RhpWaitForGCNoAbort
+
+ DoneWaitingForGc:
+ mov rcx, rbx ; RCX <- PSP
+
+ fxrstor [rsp + 0c0h]
+
+ cmp [g_fHasFastFxsave], 0 ; mirror of the save path: reload the xmm registers that were saved by hand
+ jz DontRestoreXmmAgain
+
+ movdqa xmm0 , [rsp + 0c0h + 0a0h + 0*10h]
+ movdqa xmm1 , [rsp + 0c0h + 0a0h + 1*10h]
+ movdqa xmm2 , [rsp + 0c0h + 0a0h + 2*10h]
+ movdqa xmm3 , [rsp + 0c0h + 0a0h + 3*10h]
+ movdqa xmm4 , [rsp + 0c0h + 0a0h + 4*10h]
+ movdqa xmm5 , [rsp + 0c0h + 0a0h + 5*10h]
+ movdqa xmm6 , [rsp + 0c0h + 0a0h + 6*10h]
+ movdqa xmm7 , [rsp + 0c0h + 0a0h + 7*10h]
+ movdqa xmm8 , [rsp + 0c0h + 0a0h + 8*10h]
+ movdqa xmm9 , [rsp + 0c0h + 0a0h + 9*10h]
+ movdqa xmm10, [rsp + 0c0h + 0a0h + 10*10h]
+ movdqa xmm11, [rsp + 0c0h + 0a0h + 11*10h]
+ movdqa xmm12, [rsp + 0c0h + 0a0h + 12*10h]
+ movdqa xmm13, [rsp + 0c0h + 0a0h + 13*10h]
+ movdqa xmm14, [rsp + 0c0h + 0a0h + 14*10h]
+ movdqa xmm15, [rsp + 0c0h + 0a0h + 15*10h]
+
+DontRestoreXmmAgain:
+ add rsp, sizeof_OutgoingScratchSpace
+ mov eax, [rsp + OFFSETOF__PInvokeTransitionFrame__m_dwFlags]
+ test eax, PTFF_THREAD_ABORT ; flags from this test are consumed by the jz after the pops below
+ pop rax ; m_RIP
+ pop rbp ; m_FramePointer
+ pop rax ; m_pThread
+ pop rax ; m_dwFlags / m_dwAlignPad2
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ pop rax ; RSP save (discarded; PSP is already in RCX)
+ pop rax ; RAX save
+ pop rdx ; RCX save (intentionally discarding it)
+ pop rdx ; RDX save
+ pop r8
+ pop r9
+ pop r10
+ pop r11
+
+
+ ;; RCX is PSP at this point and the stack looks like this:
+ ;; [PSP - 18] -> eflags save
+ ;; [PSP - 10] -> rcx save
+ ;; [PSP - 8] -> return address
+ ;; [PSP] -> caller's frame
+ ;;
+ ;; The final step is to restore eflags and rcx, and then return to the return address.
+
+ lea rsp, [rcx - 18h] ; RSP -> eflags save; lea leaves the flags from the test above intact
+ jz @f ;; PTFF_THREAD_ABORT clear -> normal return (result of the test instruction before the pops above)
+ popfq ;; restore flags
+ pop rcx ;; restore rcx
+ mov rcx, STATUS_REDHAWK_THREAD_ABORT
+ pop rdx ;; return address as exception RIP
+ jmp RhpThrowHwEx ;; Throw the ThreadAbortException as a special kind of hardware exception
+
+@@:
+ popfq ;; restore flags
+ pop rcx ;; restore rcx
+ ret
+
+NESTED_END RhpTrapToGC, _TEXT
+
ifdef FEATURE_GC_STRESS
;;
;; INVARIANT: Don't trash the argument registers, the binder codegen depends on this.