Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/corert.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichal Strehovsky <michals@microsoft.com>2017-08-14 20:59:13 +0300
committerMichal Strehovsky <michals@microsoft.com>2017-08-14 20:59:13 +0300
commit1e9054a91177eed92094790ae063a4d741185aa2 (patch)
tree2b37c7212aed534b66c9843b8cf2b86c0c24925b /src/Native/Runtime/arm
parentd714e3be0bb99c0f76f3b62ce310878be0b3f66e (diff)
Speed up string allocations by 35%
`FastAllocateString` (the choke point through which all string allocations go) wasn't as fast as it could be, and we were 30% slower than the CLR at allocating strings. We were leaving a lot of perf on the table. Before this change, string allocation used the same allocator as arrays. Since there's a subtle difference between the failure modes on overflow (string allocation throws OOM, array allocation throws OverflowException), `FastAllocateString` required a try/catch block to handle the corner case. This was inhibiting codegen optimizations around this code path - to fix that problem, we needed a separate allocator. And since we now had a separate allocator for strings, I also took the liberty of inlining some details around strings (component size and base size) into the helper. It turns out the runtime already hardcodes the details around strings (the component size) in a couple of places anyway, so this is not as big a "separation of concerns" violation as it looks. [tfs-changeset: 1670224]
Diffstat (limited to 'src/Native/Runtime/arm')
-rw-r--r--src/Native/Runtime/arm/AllocFast.S94
-rw-r--r--src/Native/Runtime/arm/AllocFast.asm67
-rw-r--r--src/Native/Runtime/arm/AsmMacros.h7
3 files changed, 158 insertions, 10 deletions
diff --git a/src/Native/Runtime/arm/AllocFast.S b/src/Native/Runtime/arm/AllocFast.S
index eb21bfed0..3f76d4dc8 100644
--- a/src/Native/Runtime/arm/AllocFast.S
+++ b/src/Native/Runtime/arm/AllocFast.S
@@ -112,24 +112,97 @@ LOCAL_LABEL(NewOutOfMemory):
NESTED_END RhpNewObject, _TEXT
+
+// Allocate a string (the fast path bumps the current thread's allocation context pointer).
+// r0 == EEType (on return: r0 == the newly allocated string object)
+// r1 == element/character count
+LEAF_ENTRY RhpNewString, _TEXT
+ PROLOG_PUSH "{r4-r6,lr}"
+ // Make sure computing the overall allocation size won't overflow (the compare is unsigned)
+ MOV32 r12, ((0xFFFFFFFF - STRING_BASE_SIZE - 3) / STRING_COMPONENT_SIZE)
+ cmp r1, r12
+ bhi LOCAL_LABEL(StringSizeOverflow)
+
+ // Compute overall allocation size (align(base size + (element size * elements), 4)).
+ mov r2, #(STRING_BASE_SIZE + 3)
+#if STRING_COMPONENT_SIZE == 2
+ add r2, r2, r1, lsl #1 // r2 += characters * 2
+#else
+ NotImplementedComponentSize
+#endif
+ bic r2, r2, #3 // round down after the +3 above == round up to a multiple of 4
+
+ mov r4, r0 // Save EEType (r0 receives the Thread* below)
+ mov r5, r1 // Save element count
+ mov r6, r2 // Save string size
+ // r0 = GetThread()
+ INLINE_GETTHREAD
+ // r4 == EEType
+ // r5 == element count
+ // r6 == string size
+ // r0 == Thread*
+
+ // Load potential new object address into r12.
+ ldr r12, [r0, #OFFSETOF__Thread__m_alloc_context__alloc_ptr]
+
+ // Determine whether the end of the object would lie outside of the current allocation context. If so,
+ // we abandon the attempt to allocate the object directly and fall back to the slow helper.
+ adds r6, r12 // r6 = string size + alloc_ptr (prospective new alloc_ptr)
+ bcs LOCAL_LABEL(RhpNewString_RarePath) // if we get a carry here, the string is too large to fit below 4 GB
+
+ ldr r12, [r0, #OFFSETOF__Thread__m_alloc_context__alloc_limit]
+ cmp r6, r12
+ bhi LOCAL_LABEL(RhpNewString_RarePath)
+
+ // Reload new object address into r12 (r12 was reused for alloc_limit above).
+ ldr r12, [r0, #OFFSETOF__Thread__m_alloc_context__alloc_ptr]
+
+ // Update the alloc pointer to account for the allocation.
+ str r6, [r0, #OFFSETOF__Thread__m_alloc_context__alloc_ptr]
+
+ // Set the new object's EEType pointer and element count.
+ str r4, [r12, #OFFSETOF__Object__m_pEEType]
+ str r5, [r12, #OFFSETOF__String__m_Length]
+
+ // Return the object allocated in r0.
+ mov r0, r12
+ EPILOG_POP "{r4-r6,pc}"
+
+LOCAL_LABEL(StringSizeOverflow):
+ // We get here if the size of the final string object can't be represented as an unsigned
+ // 32-bit value. We're going to tail-call to a managed helper that will throw
+ // an OOM exception that the caller of this allocator understands.
+
+ // EEType is in r0 already
+ mov r1, #0 // Indicate that we should throw OOM
+ EPILOG_POP "{r4-r6,lr}"
+ b C_FUNC(RhExceptionHandling_FailedAllocation)
+
+LOCAL_LABEL(RhpNewString_RarePath):
+ mov r3, r0
+ mov r0, r4
+ mov r1, r5
+ mov r2, r6
+ // r0 == EEType
+ // r1 == element count
+ // r2 == string size + Thread::m_alloc_context::alloc_ptr
+ // r3 == Thread
+ EPILOG_POP "{r4-r6,lr}"
+ b C_FUNC(RhpNewArrayRare)
+
+LEAF_END RhpNewString, _TEXT
+
+
// Allocate one dimensional, zero based array (SZARRAY).
// r0 == EEType
// r1 == element count
LEAF_ENTRY RhpNewArray, _TEXT
PROLOG_PUSH "{r4-r6,lr}"
- // we want to limit the element count to the non-negative 32-bit int range
- movw r12, #0xffff
- movt r12, #0x7fff
- cmp r1, r12
- bhi LOCAL_LABEL(ArraySizeOverflow)
- // if the element count is negative, it's an overflow error
- cmp r1, #0
- blt LOCAL_LABEL(ArraySizeOverflow)
// Compute overall allocation size (align(base size + (element size * elements), 4)).
// if the element count is <= 0x10000, no overflow is possible because the component
// size is <= 0xffff (it's an unsigned 16-bit value) and thus the product is <= 0xffff0000
- // and the base size is only 12 bytes.
+ // and the base size for the worst case (32 dimensional MdArray) is less than 0xffff.
ldrh r2, [r0, #OFFSETOF__EEType__m_usComponentSize]
cmp r1, #0x10000
bhi LOCAL_LABEL(ArraySizeBig)
@@ -177,6 +250,9 @@ LOCAL_LABEL(ArrayAlignSize):
EPILOG_POP "{r4-r6,pc}"
LOCAL_LABEL(ArraySizeBig):
+ // if the element count is negative, it's an overflow error
+ cmp r1, #0
+ blt LOCAL_LABEL(ArraySizeOverflow)
// now we know the element count is in the signed int range [0..0x7fffffff]
// overflow in computing the total size of the array size gives an out of memory exception,
diff --git a/src/Native/Runtime/arm/AllocFast.asm b/src/Native/Runtime/arm/AllocFast.asm
index 72977301b..82a3187d0 100644
--- a/src/Native/Runtime/arm/AllocFast.asm
+++ b/src/Native/Runtime/arm/AllocFast.asm
@@ -113,6 +113,71 @@ NewOutOfMemory
NESTED_END RhpNewObject
+;; Allocate a string (the fast path bumps the current thread's allocation context pointer).
+;; r0 == EEType (on return: r0 == the newly allocated string object)
+;; r1 == element/character count
+ LEAF_ENTRY RhpNewString
+
+ ; Make sure computing the overall allocation size won't overflow (the compare is unsigned)
+ MOV32 r2, ((0xFFFFFFFF - STRING_BASE_SIZE - 3) / STRING_COMPONENT_SIZE)
+ cmp r1, r2
+ bhs StringSizeOverflow ; bhs also rejects a count equal to the limit (stricter than needed, harmless)
+
+ ; Compute overall allocation size (align(base size + (element size * elements), 4)).
+ mov r2, #(STRING_BASE_SIZE + 3)
+#if STRING_COMPONENT_SIZE == 2
+ add r2, r2, r1, lsl #1 ; r2 += characters * 2
+#else
+ NotImplementedComponentSize
+#endif
+ bic r2, r2, #3 ; round down after the +3 above == round up to a multiple of 4
+
+ ; r0 == EEType
+ ; r1 == element count
+ ; r2 == string size
+
+ INLINE_GETTHREAD r3, r12 ; r3 is used as the Thread* base for the loads/stores below
+
+ ;; Load potential new object address into r12.
+ ldr r12, [r3, #OFFSETOF__Thread__m_alloc_context__alloc_ptr]
+
+ ;; Determine whether the end of the object would lie outside of the current allocation context. If so,
+ ;; we abandon the attempt to allocate the object directly and fall back to the slow helper.
+ adds r2, r12 ; r2 = string size + alloc_ptr (prospective new alloc_ptr)
+ bcs RhpNewArrayRare ; if we get a carry here, the string is too large to fit below 4 GB
+ ldr r12, [r3, #OFFSETOF__Thread__m_alloc_context__alloc_limit]
+ cmp r2, r12
+ bhi RhpNewArrayRare
+
+ ;; Reload new object address into r12 (r12 was reused for alloc_limit above).
+ ldr r12, [r3, #OFFSETOF__Thread__m_alloc_context__alloc_ptr]
+
+ ;; Update the alloc pointer to account for the allocation.
+ str r2, [r3, #OFFSETOF__Thread__m_alloc_context__alloc_ptr]
+
+ ;; Set the new object's EEType pointer and element count.
+ str r0, [r12, #OFFSETOF__Object__m_pEEType]
+ str r1, [r12, #OFFSETOF__String__m_Length]
+
+ ;; Return the object allocated in r0.
+ mov r0, r12
+
+ bx lr
+
+StringSizeOverflow
+ ; We get here if the size of the final string object can't be represented as an unsigned
+ ; 32-bit value. We're going to tail-call to a managed helper that will throw
+ ; an OOM exception that the caller of this allocator understands.
+
+ ; r0 holds EEType pointer already
+ mov r1, #0 ; Indicate that we should throw OOM.
+ b RhExceptionHandling_FailedAllocation
+
+ LEAF_END RhpNewString
+
+ INLINE_GETTHREAD_CONSTANT_POOL
+
+
;; Allocate one dimensional, zero based array (SZARRAY).
;; r0 == EEType
;; r1 == element count
@@ -121,7 +186,7 @@ NewOutOfMemory
; Compute overall allocation size (align(base size + (element size * elements), 4)).
; if the element count is <= 0x10000, no overflow is possible because the component
; size is <= 0xffff (it's an unsigned 16-bit value) and thus the product is <= 0xffff0000
- ; and the base size is only 12 bytes.
+ ; and the base size for the worst case (32 dimensional MdArray) is less than 0xffff.
ldrh r2, [r0, #OFFSETOF__EEType__m_usComponentSize]
cmp r1, #0x10000
bhi ArraySizeBig
diff --git a/src/Native/Runtime/arm/AsmMacros.h b/src/Native/Runtime/arm/AsmMacros.h
index 30ff96a34..1e64ddc76 100644
--- a/src/Native/Runtime/arm/AsmMacros.h
+++ b/src/Native/Runtime/arm/AsmMacros.h
@@ -236,6 +236,13 @@ $Name
#endif
MEND
+;; MOV32: materialize a 32-bit constant in $destReg, low halfword via movw, high halfword via movt
+ MACRO
+ MOV32 $destReg, $imm32
+
+ movw $destReg, #(($imm32) :AND: 0xFFFF)
+ movt $destReg, #(($imm32) :SHR: 16)
+ MEND
;;
;; CONSTANTS -- SYMBOLS