From 7a2404f9d39a4491c68d961142f435c82d66a614 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 12 Mar 2026 04:57:48 +0100 Subject: [PATCH 01/10] inline allocators --- src/coreclr/inc/corinfo.h | 30 +++++ src/coreclr/inc/icorjitinfoimpl_generated.h | 3 + src/coreclr/inc/jiteeversionguid.h | 10 +- src/coreclr/jit/ICorJitInfo_names_generated.h | 1 + .../jit/ICorJitInfo_wrapper_generated.hpp | 8 ++ src/coreclr/jit/codegen.h | 1 + src/coreclr/jit/codegenxarch.cpp | 110 ++++++++++++++++++ src/coreclr/jit/compiler.h | 14 +++ src/coreclr/jit/gtlist.h | 2 +- src/coreclr/jit/jitconfigvalues.h | 1 + src/coreclr/jit/lower.cpp | 6 + src/coreclr/jit/lsrabuild.cpp | 4 + src/coreclr/jit/lsraxarch.cpp | 19 +++ src/coreclr/jit/objectalloc.cpp | 25 +++- .../tools/Common/JitInterface/CorInfoImpl.cs | 6 + .../JitInterface/CorInfoImpl_generated.cs | 16 +++ .../tools/Common/JitInterface/CorInfoTypes.cs | 14 +++ .../ThunkGenerator/ThunkInput.txt | 2 + .../aot/jitinterface/jitinterface_generated.h | 9 ++ .../tools/superpmi/superpmi-shared/agnostic.h | 14 +++ .../tools/superpmi/superpmi-shared/lwmlist.h | 1 + .../superpmi-shared/methodcontext.cpp | 60 ++++++++++ .../superpmi/superpmi-shared/methodcontext.h | 5 + .../superpmi-shim-collector/icorjitinfo.cpp | 7 ++ .../icorjitinfo_generated.cpp | 7 ++ .../icorjitinfo_generated.cpp | 6 + .../tools/superpmi/superpmi/icorjitinfo.cpp | 6 + src/coreclr/vm/amd64/asmhelpers.S | 23 ++++ src/coreclr/vm/jitinterface.cpp | 58 +++++++++ src/coreclr/vm/threadstatics.cpp | 40 +++++++ src/coreclr/vm/threadstatics.h | 1 + 31 files changed, 498 insertions(+), 11 deletions(-) diff --git a/src/coreclr/inc/corinfo.h b/src/coreclr/inc/corinfo.h index 4dacf6cfad171b..df14401d462c2f 100644 --- a/src/coreclr/inc/corinfo.h +++ b/src/coreclr/inc/corinfo.h @@ -1695,6 +1695,33 @@ struct CORINFO_THREAD_STATIC_INFO_NATIVEAOT CORINFO_CONST_LOOKUP tlsGetAddrFtnPtr; }; +//---------------------------------------------------------------------------- +// 
getObjectAllocContextInfo and CORINFO_OBJECT_ALLOC_CONTEXT_INFO: The EE instructs the JIT +// about how to access the thread-local allocation context for inline object allocation. + +struct CORINFO_OBJECT_ALLOC_CONTEXT_INFO +{ + // Whether inline allocation is supported for this runtime configuration. + // False when: GCStress enabled, allocation tracking/sampling active, + // non-thread-local allocation contexts, etc. + bool supported; + + // Offsets within the ee_alloc_context structure + uint32_t allocPtrFieldOffset; // Offset of alloc_ptr + uint32_t combinedLimitFieldOffset; // Offset of combined_limit + + // Object/MethodTable layout offsets + uint32_t objectMethodTableOffset; // Offset of MethodTable* in Object + uint32_t methodTableBaseSizeOffset; // Offset of m_BaseSize in MethodTable + + // TLS access info (platform-specific) + CORINFO_CONST_LOOKUP tlsIndex; // Windows: address of _tls_index + uint32_t offsetOfThreadLocalStoragePointer; // Windows: TEB offset for TLS array (0x58 on x64) + CORINFO_CONST_LOOKUP tlsRoot; // TLS symbol for t_runtime_thread_locals (SECTIONREL/TLSGD/TLVP) + void* tlsGetAddrFtnPtr; // Linux: address of __tls_get_addr + void* threadVarsSection; // macOS: section address for TLVP +}; + //---------------------------------------------------------------------------- // Exception handling @@ -3207,6 +3234,9 @@ class ICorStaticInfo // Returns the primitive type for passing/returning a Wasm struct by value, // or CORINFO_WASM_TYPE_VOID if passing/returning must be by reference. virtual CorInfoWasmType getWasmLowering(CORINFO_CLASS_HANDLE structHnd) = 0; + + // Returns information about the thread-local allocation context for inline object allocation. 
+ virtual void getObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) = 0; }; /***************************************************************************** diff --git a/src/coreclr/inc/icorjitinfoimpl_generated.h b/src/coreclr/inc/icorjitinfoimpl_generated.h index 7a67f9ea0ff3cd..3d060a9fa00e75 100644 --- a/src/coreclr/inc/icorjitinfoimpl_generated.h +++ b/src/coreclr/inc/icorjitinfoimpl_generated.h @@ -422,6 +422,9 @@ void getThreadLocalStaticBlocksInfo( void getThreadLocalStaticInfo_NativeAOT( CORINFO_THREAD_STATIC_INFO_NATIVEAOT* pInfo) override; +void getObjectAllocContextInfo( + CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) override; + bool isFieldStatic( CORINFO_FIELD_HANDLE fldHnd) override; diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index c87fdefeed1272..43fb9d3dc576ee 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -37,11 +37,11 @@ #include -constexpr GUID JITEEVersionIdentifier = { /* 89e70385-3f0d-4fbd-9270-0425c0db321b */ - 0x89e70385, - 0x3f0d, - 0x4fbd, - {0x92, 0x70, 0x04, 0x25, 0xc0, 0xdb, 0x32, 0x1b} +constexpr GUID JITEEVersionIdentifier = { /* 0deac13a-318b-4a61-826b-9fc3a2ec2a7d */ + 0x0deac13a, + 0x318b, + 0x4a61, + {0x82, 0x6b, 0x9f, 0xc3, 0xa2, 0xec, 0x2a, 0x7d} }; #endif // JIT_EE_VERSIONING_GUID_H diff --git a/src/coreclr/jit/ICorJitInfo_names_generated.h b/src/coreclr/jit/ICorJitInfo_names_generated.h index d03d03b1007970..d62bae363b36d0 100644 --- a/src/coreclr/jit/ICorJitInfo_names_generated.h +++ b/src/coreclr/jit/ICorJitInfo_names_generated.h @@ -106,6 +106,7 @@ DEF_CLR_API(getFieldInfo) DEF_CLR_API(getThreadLocalFieldInfo) DEF_CLR_API(getThreadLocalStaticBlocksInfo) DEF_CLR_API(getThreadLocalStaticInfo_NativeAOT) +DEF_CLR_API(getObjectAllocContextInfo) DEF_CLR_API(isFieldStatic) DEF_CLR_API(getArrayOrStringLength) DEF_CLR_API(getBoundaries) diff --git a/src/coreclr/jit/ICorJitInfo_wrapper_generated.hpp 
b/src/coreclr/jit/ICorJitInfo_wrapper_generated.hpp index 80411912d6c9cc..4fea0149300cdc 100644 --- a/src/coreclr/jit/ICorJitInfo_wrapper_generated.hpp +++ b/src/coreclr/jit/ICorJitInfo_wrapper_generated.hpp @@ -1005,6 +1005,14 @@ void WrapICorJitInfo::getThreadLocalStaticInfo_NativeAOT( API_LEAVE(getThreadLocalStaticInfo_NativeAOT); } +void WrapICorJitInfo::getObjectAllocContextInfo( + CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) +{ + API_ENTER(getObjectAllocContextInfo); + wrapHnd->getObjectAllocContextInfo(pInfo); + API_LEAVE(getObjectAllocContextInfo); +} + bool WrapICorJitInfo::isFieldStatic( CORINFO_FIELD_HANDLE fldHnd) { diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 8472dfe8c0233d..24835bded92e77 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1148,6 +1148,7 @@ class CodeGen final : public CodeGenInterface void genCodeForStoreLclFld(GenTreeLclFld* tree); void genCodeForStoreLclVar(GenTreeLclVar* tree); void genCodeForReturnTrap(GenTreeOp* tree); + void genCodeForAllocObj(GenTreeAllocObj* tree); void genCodeForStoreInd(GenTreeStoreInd* tree); void genCodeForSwap(GenTreeOp* tree); void genCodeForCpObj(GenTreeBlk* cpObjNode); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 36a120250c6cf3..a23a6025ff7fc2 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1726,6 +1726,112 @@ void CodeGen::genCodeForReturnTrap(GenTreeOp* tree) genDefineTempLabel(skipLabel); } +//------------------------------------------------------------------------ +// genCodeForAllocObj: Generate code for GT_ALLOCOBJ - inline object allocation. +// +// Emits an inline bump-pointer allocation fast path with a slow-path +// fallback to CORINFO_HELP_NEWSFAST. The fast path accesses the thread-local +// ee_alloc_context, checks if there's enough space, bumps the allocation +// pointer, and sets the MethodTable on the new object. 
If there isn't enough +// space, it falls through to the slow path which calls the runtime helper. +// +// The entire fast path (from reading alloc_ptr to updating it) is marked +// as non-GC-interruptible via emitDisableGC/emitEnableGC. +// +void CodeGen::genCodeForAllocObj(GenTreeAllocObj* tree) +{ + assert(tree->OperIs(GT_ALLOCOBJ)); + + const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocInfo = m_compiler->compGetAllocContextInfo(); + assert(allocInfo->supported); + + regNumber dstReg = tree->GetRegNum(); + regNumber mtReg = genConsumeReg(tree->gtGetOp1()); + + // Get internal temp registers + regNumber allocCtxReg = internalRegisters.Extract(tree, RBM_ALLINT); + regNumber tmpReg = internalRegisters.Extract(tree, RBM_ALLINT); + + emitter* emit = GetEmitter(); + + // ---- TLS access: get pointer to ee_alloc_context ---- + if (TargetOS::IsWindows) + { + // Windows x64: gs:[0x58] -> TLS array -> [tls_index * 8] -> + SECTIONREL offset + + // mov allocCtxReg, gs:[offsetOfThreadLocalStoragePointer] (TEB.ThreadLocalStoragePointer) + emit->emitIns_R_C(INS_mov, EA_PTRSIZE, allocCtxReg, FLD_GLOBAL_GS, + (int)allocInfo->offsetOfThreadLocalStoragePointer); + + // mov tmpReg, [&_tls_index] -- load _tls_index value from its address + assert(allocInfo->tlsIndex.accessType == IAT_PVALUE); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)allocInfo->tlsIndex.addr); + emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, tmpReg, 0); + + // mov allocCtxReg, [allocCtxReg + tmpReg * 8] (index into TLS array) + emit->emitIns_R_ARX(INS_mov, EA_PTRSIZE, allocCtxReg, allocCtxReg, tmpReg, 8, 0); + + // lea allocCtxReg, [allocCtxReg + SECTIONREL t_runtime_thread_locals] + assert(allocInfo->tlsRoot.accessType == IAT_VALUE); + ssize_t secrelOffset = (ssize_t)allocInfo->tlsRoot.addr; + emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, allocCtxReg, allocCtxReg, (int)secrelOffset); + } + else + { + // Linux/macOS: not yet implemented - EE should return supported=false + unreached(); + } + + // ---- Disable GC 
for the allocation critical section ---- + GetEmitter()->emitDisableGC(); + + // ---- Bump pointer allocation ---- + // Read baseSize from MethodTable FIRST, before loading alloc_ptr into dstReg. + // LSRA guarantees dstReg != mtReg (via DelayFree), but reading baseSize first + // keeps the sequence clean and matches the runtime helper's pattern. + emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, mtReg, (int)allocInfo->methodTableBaseSizeOffset); + + // dstReg = alloc_ptr (this will be the returned object pointer) + emit->emitIns_R_AR(INS_mov, EA_PTRSIZE, dstReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); + + // tmpReg = alloc_ptr + baseSize (potential new alloc_ptr) + emit->emitIns_R_ARX(INS_lea, EA_PTRSIZE, tmpReg, dstReg, tmpReg, 1, 0); + + // Compare new alloc_ptr against combined_limit + emit->emitIns_R_AR(INS_cmp, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->combinedLimitFieldOffset); + + BasicBlock* slowPath = genCreateTempLabel(); + // If new alloc_ptr > combined_limit, go to slow path + inst_JMP(EJ_ja, slowPath); + + // ---- Fast path: allocation succeeded ---- + emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, mtReg, dstReg, (int)allocInfo->objectMethodTableOffset); + emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); + + BasicBlock* done = genCreateTempLabel(); + inst_JMP(EJ_jmp, done); + + // ---- Slow path: call the allocation helper ---- + genDefineTempLabel(slowPath); + + regNumber mtArgReg = REG_ARG_0; + inst_Mov(TYP_I_IMPL, mtArgReg, mtReg, /* canSkip */ true); + gcInfo.gcMarkRegSetNpt(genRegMask(mtArgReg)); + + genEmitHelperCall(CORINFO_HELP_NEWSFAST, 0, EA_PTRSIZE); + + inst_Mov(TYP_REF, dstReg, REG_INTRET, /* canSkip */ true); + + // ---- Done ---- + genDefineTempLabel(done); + + // Re-enable GC after both paths converge + GetEmitter()->emitEnableGC(); + + gcInfo.gcMarkRegPtrVal(dstReg, TYP_REF); + genProduceReg(tree); +} + 
/***************************************************************************** * * Generate code for a single node in the tree. @@ -1807,6 +1913,10 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode) genLclHeap(treeNode); break; + case GT_ALLOCOBJ: + genCodeForAllocObj(treeNode->AsAllocObj()); + break; + case GT_CNS_INT: #ifdef TARGET_X86 assert(!treeNode->IsIconHandle(GTF_ICON_TLS_HDL)); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index edadb7d14f7bfc..73073f8e0ea0c9 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10278,6 +10278,10 @@ class Compiler bool compMaskConvertUsed = false; // Does the method have Convert Mask To Vector nodes. bool compUsesThrowHelper = false; // There is a call to a THROW_HELPER for the compiled method. + // Cached allocation context info for inline object allocation + CORINFO_OBJECT_ALLOC_CONTEXT_INFO compAllocContextInfo; + bool compAllocContextInfoInitialized = false; + // NOTE: These values are only reliable after // the importing is completely finished. @@ -11350,6 +11354,16 @@ class Compiler CORINFO_CONST_LOOKUP compGetHelperFtn(CorInfoHelpFunc ftnNum); + const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* compGetAllocContextInfo() + { + if (!compAllocContextInfoInitialized) + { + info.compCompHnd->getObjectAllocContextInfo(&compAllocContextInfo); + compAllocContextInfoInitialized = true; + } + return &compAllocContextInfo; + } + // Several JIT/EE interface functions return a CorInfoType, and also return a // class handle as an out parameter if the type is a value class. Returns the // size of the type these describe. 
diff --git a/src/coreclr/jit/gtlist.h b/src/coreclr/jit/gtlist.h index e8f61c1dea5263..cdddcbdc1dbbf7 100644 --- a/src/coreclr/jit/gtlist.h +++ b/src/coreclr/jit/gtlist.h @@ -91,7 +91,7 @@ GTNODE(ARR_LENGTH , GenTreeArrLen ,0,0,GTK_UNOP|GTK_EXOP) GTNODE(MDARR_LENGTH , GenTreeMDArr ,0,1,GTK_UNOP|GTK_EXOP) // multi-dimension (MD) array length of a specific dimension GTNODE(MDARR_LOWER_BOUND, GenTreeMDArr ,0,1,GTK_UNOP|GTK_EXOP) // multi-dimension (MD) array lower bound of a specific dimension GTNODE(FIELD_ADDR , GenTreeFieldAddr ,0,0,GTK_UNOP|GTK_EXOP|DBK_NOTLIR) // Field address -GTNODE(ALLOCOBJ , GenTreeAllocObj ,0,0,GTK_UNOP|GTK_EXOP|DBK_NOTLIR) // object allocator +GTNODE(ALLOCOBJ , GenTreeAllocObj ,0,0,GTK_UNOP|GTK_EXOP) // object allocator GTNODE(INIT_VAL , GenTreeOp ,0,1,GTK_UNOP) // Initialization value for an initBlk diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 7741e0f6b8fe19..82c784edded278 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -683,6 +683,7 @@ RELEASE_CONFIG_INTEGER(JitObjectStackAllocationConditionalEscape, "JitObjectStac CONFIG_STRING(JitObjectStackAllocationConditionalEscapeRange, "JitObjectStackAllocationConditionalEscapeRange") RELEASE_CONFIG_INTEGER(JitObjectStackAllocationArray, "JitObjectStackAllocationArray", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationSize, "JitObjectStackAllocationSize", 528) +RELEASE_CONFIG_INTEGER(JitInlineAllocFast, "JitInlineAllocFast", 1) RELEASE_CONFIG_INTEGER(JitObjectStackAllocationTrackFields, "JitObjectStackAllocationTrackFields", 1) CONFIG_STRING(JitObjectStackAllocationTrackFieldsRange, "JitObjectStackAllocationTrackFieldsRange") CONFIG_INTEGER(JitObjectStackAllocationDumpConnGraph, "JitObjectStackAllocationDumpConnGraph", 0) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index cbd4e5e814d4d3..d2f4746341763d 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -492,6 +492,12 
@@ GenTree* Lowering::LowerNode(GenTree* node) ContainCheckReturnTrap(node->AsOp()); break; +#ifdef TARGET_XARCH + case GT_ALLOCOBJ: + // GT_ALLOCOBJ operand (MethodTable handle) must be in a register + break; +#endif + case GT_CAST: { GenTree* nextNode = node->gtNext; diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 79ddcc5b1c732e..ba858c2e72b1a5 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1066,6 +1066,10 @@ regMaskTP LinearScan::getKillSetForNode(GenTree* tree) killMask = m_compiler->compHelperCallKillSet(CORINFO_HELP_STOP_FOR_GC); break; + case GT_ALLOCOBJ: + killMask = m_compiler->compHelperCallKillSet(CORINFO_HELP_NEWSFAST); + break; + case GT_CALL: killMask = getKillSetForCall(tree->AsCall()); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index a0ee265e5abfa0..0e766318e2a864 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -315,6 +315,25 @@ int LinearScan::BuildNode(GenTree* tree) } break; + case GT_ALLOCOBJ: + { + // Inline object allocation: TLS access + bump pointer + slow-path helper call. + // Internal temps for TLS access intermediates and allocation calculations. + buildInternalIntRegisterDefForNode(tree); + buildInternalIntRegisterDefForNode(tree); + // The MethodTable operand must remain live until after we load alloc_ptr + // into the destination register (they may not alias). + srcCount = BuildDelayFreeUses(tree->gtGetOp1(), tree); + // Internal regs must not overlap with the destination register either, + // since allocCtxReg is used throughout the fast and slow paths. 
+ setInternalRegsDelayFree = true; + buildInternalRegisterUses(); + killMask = m_compiler->compHelperCallKillSet(CORINFO_HELP_NEWSFAST); + BuildKills(tree, killMask); + BuildDef(tree); + } + break; + case GT_MOD: case GT_DIV: case GT_UMOD: diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index 894ed2de75b952..aa828b3e314213 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -1337,11 +1337,26 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) candidate.m_onHeapReason); if ((candidate.m_allocType == OAT_NEWOBJ) || (candidate.m_allocType == OAT_NEWOBJ_HEAP)) { - GenTree* const stmtExpr = candidate.m_tree; - GenTree* const oldData = stmtExpr->AsLclVar()->Data(); - GenTree* const newData = MorphAllocObjNodeIntoHelperCall(oldData->AsAllocObj()); - stmtExpr->AsLclVar()->Data() = newData; - stmtExpr->AddAllEffectsFlags(newData); + GenTree* const stmtExpr = candidate.m_tree; + GenTreeAllocObj* allocObj = stmtExpr->AsLclVar()->Data()->AsAllocObj(); + +#ifdef TARGET_XARCH + // Check if we can keep GT_ALLOCOBJ for inline allocation expansion in codegen + const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); + if (allocObj->gtNewHelper == CORINFO_HELP_NEWSFAST && !allocObj->gtHelperHasSideEffects && + allocCtxInfo->supported && TargetOS::IsWindows && m_compiler->opts.OptimizationEnabled() && + JitConfig.JitInlineAllocFast() != 0) + { + JITDUMP("Keeping GT_ALLOCOBJ [%06u] for inline allocation expansion\n", + m_compiler->dspTreeID(allocObj)); + } + else +#endif // TARGET_XARCH + { + GenTree* const newData = MorphAllocObjNodeIntoHelperCall(allocObj); + stmtExpr->AsLclVar()->Data() = newData; + stmtExpr->AddAllEffectsFlags(newData); + } } if (IsTrackedLocal(lclNum)) diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs index bcc358cb7a791c..cf9ab78e0d3df2 100644 --- 
a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs @@ -3709,6 +3709,12 @@ private CorInfoWasmType getWasmLowering(CORINFO_CLASS_STRUCT_* structHnd) private uint getThreadTLSIndex(ref void* ppIndirection) { throw new NotImplementedException("getThreadTLSIndex"); } + private void getObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) + { + // NativeAOT/crossgen2: not yet implemented + pInfo->supported = 0; + } + private Dictionary _helperCache = new Dictionary(); private void getHelperFtn(CorInfoHelpFunc ftnNum, CORINFO_CONST_LOOKUP *pNativeEntrypoint, CORINFO_METHOD_STRUCT_** pMethod) { diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl_generated.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl_generated.cs index a1eebc3762aaa1..1f5ade80580888 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoImpl_generated.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl_generated.cs @@ -122,6 +122,7 @@ static ICorJitInfoCallbacks() s_callbacks.getThreadLocalFieldInfo = &_getThreadLocalFieldInfo; s_callbacks.getThreadLocalStaticBlocksInfo = &_getThreadLocalStaticBlocksInfo; s_callbacks.getThreadLocalStaticInfo_NativeAOT = &_getThreadLocalStaticInfo_NativeAOT; + s_callbacks.getObjectAllocContextInfo = &_getObjectAllocContextInfo; s_callbacks.isFieldStatic = &_isFieldStatic; s_callbacks.getArrayOrStringLength = &_getArrayOrStringLength; s_callbacks.getBoundaries = &_getBoundaries; @@ -304,6 +305,7 @@ static ICorJitInfoCallbacks() public delegate* unmanaged getThreadLocalFieldInfo; public delegate* unmanaged getThreadLocalStaticBlocksInfo; public delegate* unmanaged getThreadLocalStaticInfo_NativeAOT; + public delegate* unmanaged getObjectAllocContextInfo; public delegate* unmanaged isFieldStatic; public delegate* unmanaged getArrayOrStringLength; public delegate* unmanaged getBoundaries; @@ -1899,6 +1901,20 @@ private static void _getThreadLocalStaticInfo_NativeAOT(IntPtr 
thisHandle, IntPt } } + [UnmanagedCallersOnly] + private static void _getObjectAllocContextInfo(IntPtr thisHandle, IntPtr* ppException, CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) + { + var _this = GetThis(thisHandle); + try + { + _this.getObjectAllocContextInfo(pInfo); + } + catch (Exception ex) + { + *ppException = _this.AllocException(ex); + } + } + [UnmanagedCallersOnly] private static byte _isFieldStatic(IntPtr thisHandle, IntPtr* ppException, CORINFO_FIELD_STRUCT_* fldHnd) { diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs index 4ba39d65fefc2c..e3868d9e729b36 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs @@ -1276,6 +1276,20 @@ public unsafe struct CORINFO_THREAD_STATIC_INFO_NATIVEAOT public CORINFO_CONST_LOOKUP tlsGetAddrFtnPtr; }; + public unsafe struct CORINFO_OBJECT_ALLOC_CONTEXT_INFO + { + public byte supported; + public uint allocPtrFieldOffset; + public uint combinedLimitFieldOffset; + public uint objectMethodTableOffset; + public uint methodTableBaseSizeOffset; + public CORINFO_CONST_LOOKUP tlsIndex; + public uint offsetOfThreadLocalStoragePointer; + public CORINFO_CONST_LOOKUP tlsRoot; + public nuint tlsGetAddrFtnPtr; + public nuint threadVarsSection; + }; + // System V struct passing // The Classification types are described in the ABI spec at https://software.intel.com/sites/default/files/article/402129/mpx-linux64-abi.pdf public enum SystemVClassificationType : byte diff --git a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt index 8e605002f90174..4eeb7087d9f945 100644 --- a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt +++ b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt @@ -92,6 +92,7 @@ CORINFO_METHOD_INFO*,CORINFO_METHOD_INFO* 
CORINFO_FIELD_INFO*,CORINFO_FIELD_INFO* CORINFO_THREAD_STATIC_BLOCKS_INFO*,CORINFO_THREAD_STATIC_BLOCKS_INFO* CORINFO_THREAD_STATIC_INFO_NATIVEAOT*,CORINFO_THREAD_STATIC_INFO_NATIVEAOT* +CORINFO_OBJECT_ALLOC_CONTEXT_INFO*,CORINFO_OBJECT_ALLOC_CONTEXT_INFO* CORINFO_CALL_INFO*,CORINFO_CALL_INFO* CORINFO_DEVIRTUALIZATION_INFO*,CORINFO_DEVIRTUALIZATION_INFO* CORINFO_TYPE_LAYOUT_NODE*,CORINFO_TYPE_LAYOUT_NODE* @@ -275,6 +276,7 @@ FUNCTIONS uint32_t getThreadLocalFieldInfo (CORINFO_FIELD_HANDLE field, bool isGCtype) void getThreadLocalStaticBlocksInfo (CORINFO_THREAD_STATIC_BLOCKS_INFO* pInfo) void getThreadLocalStaticInfo_NativeAOT(CORINFO_THREAD_STATIC_INFO_NATIVEAOT* pInfo) + void getObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) bool isFieldStatic(CORINFO_FIELD_HANDLE fldHnd) int getArrayOrStringLength(CORINFO_OBJECT_HANDLE objHnd) void getBoundaries(CORINFO_METHOD_HANDLE ftn, unsigned int* cILOffsets, uint32_t** pILOffsets, ICorDebugInfo::BoundaryTypes* implicitBoundaries) diff --git a/src/coreclr/tools/aot/jitinterface/jitinterface_generated.h b/src/coreclr/tools/aot/jitinterface/jitinterface_generated.h index dec68b1eb53d4f..63eafb403db84a 100644 --- a/src/coreclr/tools/aot/jitinterface/jitinterface_generated.h +++ b/src/coreclr/tools/aot/jitinterface/jitinterface_generated.h @@ -113,6 +113,7 @@ struct JitInterfaceCallbacks uint32_t (* getThreadLocalFieldInfo)(void * thisHandle, CorInfoExceptionClass** ppException, CORINFO_FIELD_HANDLE field, bool isGCtype); void (* getThreadLocalStaticBlocksInfo)(void * thisHandle, CorInfoExceptionClass** ppException, CORINFO_THREAD_STATIC_BLOCKS_INFO* pInfo); void (* getThreadLocalStaticInfo_NativeAOT)(void * thisHandle, CorInfoExceptionClass** ppException, CORINFO_THREAD_STATIC_INFO_NATIVEAOT* pInfo); + void (* getObjectAllocContextInfo)(void * thisHandle, CorInfoExceptionClass** ppException, CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo); bool (* isFieldStatic)(void * thisHandle, CorInfoExceptionClass** 
ppException, CORINFO_FIELD_HANDLE fldHnd); int (* getArrayOrStringLength)(void * thisHandle, CorInfoExceptionClass** ppException, CORINFO_OBJECT_HANDLE objHnd); void (* getBoundaries)(void * thisHandle, CorInfoExceptionClass** ppException, CORINFO_METHOD_HANDLE ftn, unsigned int* cILOffsets, uint32_t** pILOffsets, ICorDebugInfo::BoundaryTypes* implicitBoundaries); @@ -1199,6 +1200,14 @@ class JitInterfaceWrapper : public ICorJitInfo if (pException != nullptr) throw pException; } + virtual void getObjectAllocContextInfo( + CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) +{ + CorInfoExceptionClass* pException = nullptr; + _callbacks->getObjectAllocContextInfo(_thisHandle, &pException, pInfo); + if (pException != nullptr) throw pException; +} + virtual bool isFieldStatic( CORINFO_FIELD_HANDLE fldHnd) { diff --git a/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h b/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h index 7dc7ca9fcf5211..a67e38cb41c1e9 100644 --- a/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h +++ b/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h @@ -572,6 +572,20 @@ struct Agnostic_GetThreadStaticInfo_NativeAOT Agnostic_CORINFO_CONST_LOOKUP tlsGetAddrFtnPtr; }; +struct Agnostic_GetObjectAllocContextInfo +{ + DWORD supported; + DWORD allocPtrFieldOffset; + DWORD combinedLimitFieldOffset; + DWORD objectMethodTableOffset; + DWORD methodTableBaseSizeOffset; + Agnostic_CORINFO_CONST_LOOKUP tlsIndex; + DWORD offsetOfThreadLocalStoragePointer; + Agnostic_CORINFO_CONST_LOOKUP tlsRoot; + DWORDLONG tlsGetAddrFtnPtr; + DWORDLONG threadVarsSection; +}; + struct Agnostic_GetClassCtorInitializationInfo { Agnostic_CORINFO_CONST_LOOKUP addr; diff --git a/src/coreclr/tools/superpmi/superpmi-shared/lwmlist.h b/src/coreclr/tools/superpmi/superpmi-shared/lwmlist.h index b464feef60b3f6..b0edb58e552b35 100644 --- a/src/coreclr/tools/superpmi/superpmi-shared/lwmlist.h +++ b/src/coreclr/tools/superpmi/superpmi-shared/lwmlist.h @@ -173,6 +173,7 @@ 
LWM(SatisfiesMethodConstraints, DLDL, DWORD) LWM(GetUnmanagedCallConv, MethodOrSigInfoValue, DD) LWM(DoesFieldBelongToClass, DLDL, DWORD) DENSELWM(SigInstHandleMap, DWORDLONG) +LWM(GetObjectAllocContextInfo, DWORD, Agnostic_GetObjectAllocContextInfo) LWM(GetWasmTypeSymbol, Agnostic_GetWasmTypeSymbol, DWORDLONG) #undef LWM diff --git a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp index fada9c7b16cbfa..3d40a3ee28fe12 100644 --- a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp +++ b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp @@ -7806,3 +7806,63 @@ void SetDebugDumpVariables() g_debugRep = true; } } + +void MethodContext::recGetObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) +{ + if (GetObjectAllocContextInfo == nullptr) + GetObjectAllocContextInfo = new LightWeightMap(); + + Agnostic_GetObjectAllocContextInfo value; + ZeroMemory(&value, sizeof(value)); + + value.supported = pInfo->supported ? 
1 : 0; + value.allocPtrFieldOffset = pInfo->allocPtrFieldOffset; + value.combinedLimitFieldOffset = pInfo->combinedLimitFieldOffset; + value.objectMethodTableOffset = pInfo->objectMethodTableOffset; + value.methodTableBaseSizeOffset = pInfo->methodTableBaseSizeOffset; + value.tlsIndex = SpmiRecordsHelper::StoreAgnostic_CORINFO_CONST_LOOKUP(&pInfo->tlsIndex); + value.offsetOfThreadLocalStoragePointer = pInfo->offsetOfThreadLocalStoragePointer; + value.tlsRoot = SpmiRecordsHelper::StoreAgnostic_CORINFO_CONST_LOOKUP(&pInfo->tlsRoot); + value.tlsGetAddrFtnPtr = CastPointer(pInfo->tlsGetAddrFtnPtr); + value.threadVarsSection = CastPointer(pInfo->threadVarsSection); + + DWORD key = 0; + GetObjectAllocContextInfo->Add(key, value); + DEBUG_REC(dmpGetObjectAllocContextInfo(key, value)); +} + +void MethodContext::dmpGetObjectAllocContextInfo(DWORD key, const Agnostic_GetObjectAllocContextInfo& value) +{ + printf("GetObjectAllocContextInfo key %u, supported-%u" + ", allocPtrFieldOffset-%u, combinedLimitFieldOffset-%u" + ", objectMethodTableOffset-%u, methodTableBaseSizeOffset-%u" + ", tlsIndex-%s, offsetOfThreadLocalStoragePointer-%u" + ", tlsRoot-%s, tlsGetAddrFtnPtr-%016" PRIX64 + ", threadVarsSection-%016" PRIX64, + key, value.supported, + value.allocPtrFieldOffset, value.combinedLimitFieldOffset, + value.objectMethodTableOffset, value.methodTableBaseSizeOffset, + SpmiDumpHelper::DumpAgnostic_CORINFO_CONST_LOOKUP(value.tlsIndex).c_str(), + value.offsetOfThreadLocalStoragePointer, + SpmiDumpHelper::DumpAgnostic_CORINFO_CONST_LOOKUP(value.tlsRoot).c_str(), + value.tlsGetAddrFtnPtr, value.threadVarsSection); +} + +void MethodContext::repGetObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) +{ + DWORD key = 0; + Agnostic_GetObjectAllocContextInfo value = LookupByKeyOrMiss(GetObjectAllocContextInfo, key, ": key %u", key); + + DEBUG_REP(dmpGetObjectAllocContextInfo(key, value)); + + pInfo->supported = (value.supported != 0); + pInfo->allocPtrFieldOffset = 
value.allocPtrFieldOffset; + pInfo->combinedLimitFieldOffset = value.combinedLimitFieldOffset; + pInfo->objectMethodTableOffset = value.objectMethodTableOffset; + pInfo->methodTableBaseSizeOffset = value.methodTableBaseSizeOffset; + pInfo->tlsIndex = SpmiRecordsHelper::RestoreCORINFO_CONST_LOOKUP(value.tlsIndex); + pInfo->offsetOfThreadLocalStoragePointer = value.offsetOfThreadLocalStoragePointer; + pInfo->tlsRoot = SpmiRecordsHelper::RestoreCORINFO_CONST_LOOKUP(value.tlsRoot); + pInfo->tlsGetAddrFtnPtr = (void*)value.tlsGetAddrFtnPtr; + pInfo->threadVarsSection = (void*)value.threadVarsSection; +} diff --git a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.h b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.h index 067b2154d1e52f..bfc34c56b3a17f 100644 --- a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.h +++ b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.h @@ -501,6 +501,10 @@ class MethodContext void dmpGetThreadLocalStaticInfo_NativeAOT(DWORDLONG key, const Agnostic_GetThreadStaticInfo_NativeAOT& value); void repGetThreadLocalStaticInfo_NativeAOT(CORINFO_THREAD_STATIC_INFO_NATIVEAOT* pInfo); + void recGetObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo); + void dmpGetObjectAllocContextInfo(DWORD key, const Agnostic_GetObjectAllocContextInfo& value); + void repGetObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo); + void recEmbedMethodHandle(CORINFO_METHOD_HANDLE handle, void** ppIndirection, CORINFO_METHOD_HANDLE result); void dmpEmbedMethodHandle(DWORDLONG key, DLDL value); CORINFO_METHOD_HANDLE repEmbedMethodHandle(CORINFO_METHOD_HANDLE handle, void** ppIndirection); @@ -1222,6 +1226,7 @@ enum mcPackets Packet_GetWasmTypeSymbol = 235, Packet_GetWasmLowering = 236, Packet_GetAsyncOtherVariant = 237, + Packet_GetObjectAllocContextInfo = 238, }; void SetDebugDumpVariables(); diff --git a/src/coreclr/tools/superpmi/superpmi-shim-collector/icorjitinfo.cpp 
b/src/coreclr/tools/superpmi/superpmi-shim-collector/icorjitinfo.cpp index e29ba7b5cc97ff..ad9dc09d510a73 100644 --- a/src/coreclr/tools/superpmi/superpmi-shim-collector/icorjitinfo.cpp +++ b/src/coreclr/tools/superpmi/superpmi-shim-collector/icorjitinfo.cpp @@ -1131,6 +1131,13 @@ void interceptor_ICJI::getThreadLocalStaticInfo_NativeAOT(CORINFO_THREAD_STATIC_ mc->recGetThreadLocalStaticInfo_NativeAOT(pInfo); } +void interceptor_ICJI::getObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) +{ + mc->cr->AddCall("getObjectAllocContextInfo"); + original_ICorJitInfo->getObjectAllocContextInfo(pInfo); + mc->recGetObjectAllocContextInfo(pInfo); +} + // Returns true iff "fldHnd" represents a static field. bool interceptor_ICJI::isFieldStatic(CORINFO_FIELD_HANDLE fldHnd) { diff --git a/src/coreclr/tools/superpmi/superpmi-shim-counter/icorjitinfo_generated.cpp b/src/coreclr/tools/superpmi/superpmi-shim-counter/icorjitinfo_generated.cpp index 8058b6802159e8..26e1b79dda6430 100644 --- a/src/coreclr/tools/superpmi/superpmi-shim-counter/icorjitinfo_generated.cpp +++ b/src/coreclr/tools/superpmi/superpmi-shim-counter/icorjitinfo_generated.cpp @@ -821,6 +821,13 @@ void interceptor_ICJI::getThreadLocalStaticInfo_NativeAOT( original_ICorJitInfo->getThreadLocalStaticInfo_NativeAOT(pInfo); } +void interceptor_ICJI::getObjectAllocContextInfo( + CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) +{ + mcs->AddCall("getObjectAllocContextInfo"); + original_ICorJitInfo->getObjectAllocContextInfo(pInfo); +} + bool interceptor_ICJI::isFieldStatic( CORINFO_FIELD_HANDLE fldHnd) { diff --git a/src/coreclr/tools/superpmi/superpmi-shim-simple/icorjitinfo_generated.cpp b/src/coreclr/tools/superpmi/superpmi-shim-simple/icorjitinfo_generated.cpp index 852a318f83c225..b0b2a30c542505 100644 --- a/src/coreclr/tools/superpmi/superpmi-shim-simple/icorjitinfo_generated.cpp +++ b/src/coreclr/tools/superpmi/superpmi-shim-simple/icorjitinfo_generated.cpp @@ -719,6 +719,12 @@ void 
interceptor_ICJI::getThreadLocalStaticInfo_NativeAOT( original_ICorJitInfo->getThreadLocalStaticInfo_NativeAOT(pInfo); } +void interceptor_ICJI::getObjectAllocContextInfo( + CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) +{ + original_ICorJitInfo->getObjectAllocContextInfo(pInfo); +} + bool interceptor_ICJI::isFieldStatic( CORINFO_FIELD_HANDLE fldHnd) { diff --git a/src/coreclr/tools/superpmi/superpmi/icorjitinfo.cpp b/src/coreclr/tools/superpmi/superpmi/icorjitinfo.cpp index 20372434076341..5d54a7aefe369a 100644 --- a/src/coreclr/tools/superpmi/superpmi/icorjitinfo.cpp +++ b/src/coreclr/tools/superpmi/superpmi/icorjitinfo.cpp @@ -937,6 +937,12 @@ void MyICJI::getThreadLocalStaticInfo_NativeAOT(CORINFO_THREAD_STATIC_INFO_NATIV jitInstance->mc->repGetThreadLocalStaticInfo_NativeAOT(pInfo); } +void MyICJI::getObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) +{ + jitInstance->mc->cr->AddCall("getObjectAllocContextInfo"); + jitInstance->mc->repGetObjectAllocContextInfo(pInfo); +} + // Returns true iff "fldHnd" represents a static field. 
bool MyICJI::isFieldStatic(CORINFO_FIELD_HANDLE fldHnd) { diff --git a/src/coreclr/vm/amd64/asmhelpers.S b/src/coreclr/vm/amd64/asmhelpers.S index 2d71f26409bc41..9562a3741653e4 100644 --- a/src/coreclr/vm/amd64/asmhelpers.S +++ b/src/coreclr/vm/amd64/asmhelpers.S @@ -323,6 +323,29 @@ LEAF_END GetTlsIndexObjectDescOffset, _TEXT #endif // !TARGET_ANDROID #endif // !TARGET_OSX +#ifdef TARGET_APPLE +# EXTERN_C void* GetRuntimeThreadLocalsThreadVarsAddress() +LEAF_ENTRY GetRuntimeThreadLocalsThreadVarsAddress, _TEXT + mov rdi, _t_runtime_thread_locals@TLVP[rip] + ret +LEAF_END GetRuntimeThreadLocalsThreadVarsAddress, _TEXT +#endif // TARGET_APPLE + +#ifndef TARGET_APPLE +#ifndef TARGET_ANDROID +# EXTERN_C void* GetRuntimeThreadLocalsTlsIndexObjectDescOffset(); +LEAF_ENTRY GetRuntimeThreadLocalsTlsIndexObjectDescOffset, _TEXT + .byte 0x66 + lea rdi, t_runtime_thread_locals@TLSGD[rip] + .byte 0x66 + .byte 0x66 + .byte 0x48 # rex.W prefix for padding + call EXTERNAL_C_FUNC(__tls_get_addr) + int 3 +LEAF_END GetRuntimeThreadLocalsTlsIndexObjectDescOffset, _TEXT +#endif // !TARGET_ANDROID +#endif // !TARGET_APPLE + LEAF_ENTRY JIT_PollGC, _TEXT PREPARE_EXTERNAL_VAR g_TrapReturningThreads, rax cmp dword ptr [rax], 0 diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index ea24d0728a173f..926f755e3acea1 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -1329,6 +1329,64 @@ void CEEInfo::getThreadLocalStaticBlocksInfo (CORINFO_THREAD_STATIC_BLOCKS_INFO* EE_TO_JIT_TRANSITION(); } +/*********************************************************************/ +void CEEInfo::getObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) +{ + CONTRACTL { + THROWS; + GC_TRIGGERS; + MODE_PREEMPTIVE; + } CONTRACTL_END; + + JIT_TO_EE_TRANSITION(); + + memset(pInfo, 0, sizeof(*pInfo)); + + // Inline allocation is only supported when thread allocation contexts are used, + // and when GC stress, allocation tracking, and 
allocation sampling are not active. + if (!GCHeapUtilities::UseThreadAllocationContexts()) + { + pInfo->supported = false; + } +#ifdef STRESS_HEAP + else if (GCStress<cfg_alloc>::IsEnabled()) + { + pInfo->supported = false; + } +#endif + else if (CORProfilerTrackAllocations() || CORProfilerTrackAllocationsEnabled()) + { + pInfo->supported = false; + } + else if (ee_alloc_context::IsRandomizedSamplingEnabled()) + { + pInfo->supported = false; + } +#ifdef FEATURE_EVENT_TRACE + else if (ETW::TypeSystemLog::IsHeapAllocEventEnabled()) + { + pInfo->supported = false; + } +#endif + else + { + pInfo->supported = true; + + // ee_alloc_context offsets + pInfo->allocPtrFieldOffset = (uint32_t)(offsetof(ee_alloc_context, m_GCAllocContext) + offsetof(gc_alloc_context, alloc_ptr)); + pInfo->combinedLimitFieldOffset = (uint32_t)offsetof(ee_alloc_context, m_CombinedLimit); + + // Object/MethodTable layout + pInfo->objectMethodTableOffset = (uint32_t)cdac_data<Object>::m_pMethTab; + pInfo->methodTableBaseSizeOffset = (uint32_t)cdac_data<MethodTable>::BaseSize; + + // TLS access info - how to reach t_runtime_thread_locals + GetObjectAllocContextTlsInfo(pInfo); + } + + EE_TO_JIT_TRANSITION(); +} + /*********************************************************************/ void CEEInfo::getFieldInfo (CORINFO_RESOLVED_TOKEN * pResolvedToken, CORINFO_METHOD_HANDLE callerHandle, diff --git a/src/coreclr/vm/threadstatics.cpp b/src/coreclr/vm/threadstatics.cpp index 23d8efd5be9fc2..76ee48fb8dd4ea 100644 --- a/src/coreclr/vm/threadstatics.cpp +++ b/src/coreclr/vm/threadstatics.cpp @@ -1068,6 +1068,7 @@ static uint32_t ThreadLocalOffset(void* p) } #elif defined(TARGET_APPLE) extern "C" void* GetThreadVarsAddress(); +extern "C" void* GetRuntimeThreadLocalsThreadVarsAddress(); static void* GetThreadVarsSectionAddressFromDesc(uint8_t* p) { @@ -1108,6 +1109,7 @@ static void* GetThreadVarsSectionAddress() #ifdef TARGET_AMD64 extern "C" void* GetTlsIndexObjectDescOffset(); +extern "C" void* 
GetRuntimeThreadLocalsTlsIndexObjectDescOffset(); static void* GetThreadStaticDescriptor(uint8_t* p) { @@ -1192,6 +1194,44 @@ void GetThreadLocalStaticBlocksInfo(CORINFO_THREAD_STATIC_BLOCKS_INFO* pInfo) pInfo->offsetOfBaseOfThreadLocalData = (uint32_t)threadStaticBaseOffset; #endif // !TARGET_ANDROID } + +/*********************************************************************/ +// Returns TLS access information for t_runtime_thread_locals (the +// thread-local allocation context) so the JIT can inline object allocation. +void GetObjectAllocContextTlsInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) +{ +#if !defined(TARGET_ANDROID) + STANDARD_VM_CONTRACT; + +#if defined(TARGET_WINDOWS) + pInfo->tlsIndex.addr = (void*)&_tls_index; + pInfo->tlsIndex.accessType = IAT_PVALUE; + pInfo->offsetOfThreadLocalStoragePointer = offsetof(_TEB, ThreadLocalStoragePointer); + pInfo->tlsRoot.addr = (void*)(uintptr_t)ThreadLocalOffset(&t_runtime_thread_locals); + pInfo->tlsRoot.accessType = IAT_VALUE; + +#elif defined(TARGET_APPLE) && defined(TARGET_AMD64) + uint8_t* p = reinterpret_cast<uint8_t*>(&GetRuntimeThreadLocalsThreadVarsAddress); + pInfo->threadVarsSection = GetThreadVarsSectionAddressFromDesc(p); + +#elif defined(TARGET_AMD64) + pInfo->tlsGetAddrFtnPtr = reinterpret_cast<void*>(&__tls_get_addr); + uint8_t* p = reinterpret_cast<uint8_t*>(&GetRuntimeThreadLocalsTlsIndexObjectDescOffset); + pInfo->tlsRoot.addr = GetThreadStaticDescriptor(p); + pInfo->tlsRoot.accessType = IAT_VALUE; + if (pInfo->tlsRoot.addr == nullptr) + { + pInfo->supported = false; + } + +#else + pInfo->supported = false; +#endif // TARGET_WINDOWS + +#else // TARGET_ANDROID + pInfo->supported = false; +#endif // !TARGET_ANDROID +} #endif // !DACCESS_COMPILE #ifdef DACCESS_COMPILE diff --git a/src/coreclr/vm/threadstatics.h b/src/coreclr/vm/threadstatics.h index afbb3f25039031..c12d423350349d 100644 --- a/src/coreclr/vm/threadstatics.h +++ b/src/coreclr/vm/threadstatics.h @@ -347,6 +347,7 @@ void GetTLSIndexForThreadStatic(MethodTable* 
pMT, bool gcStatic, TLSIndex* pInde void FreeTLSIndicesForLoaderAllocator(LoaderAllocator *pLoaderAllocator); void* GetThreadLocalStaticBase(TLSIndex index); void GetThreadLocalStaticBlocksInfo (CORINFO_THREAD_STATIC_BLOCKS_INFO* pInfo); +void GetObjectAllocContextTlsInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo); bool CanJITOptimizeTLSAccess(); #else void EnumThreadMemoryRegions(ThreadLocalData* pThreadLocalData, CLRDataEnumMemoryFlags flags); From 87795eeb8cea6f16725e4bb2b18ca05ba504f13e Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 12 Mar 2026 05:17:16 +0100 Subject: [PATCH 02/10] feedback --- src/coreclr/inc/corinfo.h | 5 ++-- src/coreclr/jit/codegenxarch.cpp | 51 +++++++++++++------------------- src/coreclr/jit/objectalloc.cpp | 7 +++-- 3 files changed, 28 insertions(+), 35 deletions(-) diff --git a/src/coreclr/inc/corinfo.h b/src/coreclr/inc/corinfo.h index df14401d462c2f..56a53da48a90da 100644 --- a/src/coreclr/inc/corinfo.h +++ b/src/coreclr/inc/corinfo.h @@ -1715,9 +1715,10 @@ struct CORINFO_OBJECT_ALLOC_CONTEXT_INFO uint32_t methodTableBaseSizeOffset; // Offset of m_BaseSize in MethodTable // TLS access info (platform-specific) - CORINFO_CONST_LOOKUP tlsIndex; // Windows: address of _tls_index + CORINFO_CONST_LOOKUP tlsIndex; // Windows: address of _tls_index (IAT_PVALUE) uint32_t offsetOfThreadLocalStoragePointer; // Windows: TEB offset for TLS array (0x58 on x64) - CORINFO_CONST_LOOKUP tlsRoot; // TLS symbol for t_runtime_thread_locals (SECTIONREL/TLSGD/TLVP) + CORINFO_CONST_LOOKUP tlsRoot; // Windows: byte offset from the module TLS base to t_runtime_thread_locals (IAT_VALUE); + // Linux: TLSGD descriptor address; macOS: TLVP descriptor address void* tlsGetAddrFtnPtr; // Linux: address of __tls_get_addr void* threadVarsSection; // macOS: section address for TLVP }; diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index a23a6025ff7fc2..5d4e90bc3fbee6 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ 
b/src/coreclr/jit/codegenxarch.cpp @@ -1755,40 +1755,31 @@ void CodeGen::genCodeForAllocObj(GenTreeAllocObj* tree) emitter* emit = GetEmitter(); // ---- TLS access: get pointer to ee_alloc_context ---- - if (TargetOS::IsWindows) - { - // Windows x64: gs:[0x58] -> TLS array -> [tls_index * 8] -> + SECTIONREL offset + // Currently only Windows x64 is supported for inline TLS access. + assert(TargetOS::IsWindows); - // mov allocCtxReg, gs:[offsetOfThreadLocalStoragePointer] (TEB.ThreadLocalStoragePointer) - emit->emitIns_R_C(INS_mov, EA_PTRSIZE, allocCtxReg, FLD_GLOBAL_GS, - (int)allocInfo->offsetOfThreadLocalStoragePointer); + // Windows x64: gs:[0x58] -> TLS array -> [tls_index * 8] -> + offset + emit->emitIns_R_C(INS_mov, EA_PTRSIZE, allocCtxReg, FLD_GLOBAL_GS, + (int)allocInfo->offsetOfThreadLocalStoragePointer); - // mov tmpReg, [&_tls_index] -- load _tls_index value from its address - assert(allocInfo->tlsIndex.accessType == IAT_PVALUE); - instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)allocInfo->tlsIndex.addr); - emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, tmpReg, 0); + // Load _tls_index value (accessType == IAT_PVALUE means addr points to the value) + assert(allocInfo->tlsIndex.accessType == IAT_PVALUE); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)allocInfo->tlsIndex.addr); + emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, tmpReg, 0); - // mov allocCtxReg, [allocCtxReg + tmpReg * 8] (index into TLS array) - emit->emitIns_R_ARX(INS_mov, EA_PTRSIZE, allocCtxReg, allocCtxReg, tmpReg, 8, 0); + // Index into TLS array to get module's TLS block base + emit->emitIns_R_ARX(INS_mov, EA_PTRSIZE, allocCtxReg, allocCtxReg, tmpReg, 8, 0); - // lea allocCtxReg, [allocCtxReg + SECTIONREL t_runtime_thread_locals] - assert(allocInfo->tlsRoot.accessType == IAT_VALUE); - ssize_t secrelOffset = (ssize_t)allocInfo->tlsRoot.addr; - emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, allocCtxReg, allocCtxReg, (int)secrelOffset); - } - else - { - // Linux/macOS: not yet 
implemented - EE should return supported=false - unreached(); - } + // Add byte offset from TLS base to t_runtime_thread_locals + assert(allocInfo->tlsRoot.accessType == IAT_VALUE); + ssize_t tlsRootOffset = (ssize_t)allocInfo->tlsRoot.addr; + emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, allocCtxReg, allocCtxReg, (int)tlsRootOffset); - // ---- Disable GC for the allocation critical section ---- + // ---- Disable GC for the fast-path critical section ---- GetEmitter()->emitDisableGC(); // ---- Bump pointer allocation ---- - // Read baseSize from MethodTable FIRST, before loading alloc_ptr into dstReg. - // LSRA guarantees dstReg != mtReg (via DelayFree), but reading baseSize first - // keeps the sequence clean and matches the runtime helper's pattern. + // Read baseSize from MethodTable first, before loading alloc_ptr into dstReg. emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, mtReg, (int)allocInfo->methodTableBaseSizeOffset); // dstReg = alloc_ptr (this will be the returned object pointer) @@ -1801,17 +1792,20 @@ void CodeGen::genCodeForAllocObj(GenTreeAllocObj* tree) emit->emitIns_R_AR(INS_cmp, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->combinedLimitFieldOffset); BasicBlock* slowPath = genCreateTempLabel(); - // If new alloc_ptr > combined_limit, go to slow path inst_JMP(EJ_ja, slowPath); // ---- Fast path: allocation succeeded ---- emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, mtReg, dstReg, (int)allocInfo->objectMethodTableOffset); emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); + // End the non-GC-interruptible region before leaving the fast path. + GetEmitter()->emitEnableGC(); + BasicBlock* done = genCreateTempLabel(); inst_JMP(EJ_jmp, done); // ---- Slow path: call the allocation helper ---- + // This is in a GC-interruptible region so the helper call is a proper GC safe point. 
genDefineTempLabel(slowPath); regNumber mtArgReg = REG_ARG_0; @@ -1825,9 +1819,6 @@ void CodeGen::genCodeForAllocObj(GenTreeAllocObj* tree) // ---- Done ---- genDefineTempLabel(done); - // Re-enable GC after both paths converge - GetEmitter()->emitEnableGC(); - gcInfo.gcMarkRegPtrVal(dstReg, TYP_REF); genProduceReg(tree); } diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index aa828b3e314213..2b16385081d6c3 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -1340,8 +1340,9 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) GenTree* const stmtExpr = candidate.m_tree; GenTreeAllocObj* allocObj = stmtExpr->AsLclVar()->Data()->AsAllocObj(); -#ifdef TARGET_XARCH - // Check if we can keep GT_ALLOCOBJ for inline allocation expansion in codegen +#ifdef TARGET_AMD64 + // Check if we can keep GT_ALLOCOBJ for inline allocation expansion in codegen. + // Currently only Windows x64 is supported. const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); if (allocObj->gtNewHelper == CORINFO_HELP_NEWSFAST && !allocObj->gtHelperHasSideEffects && allocCtxInfo->supported && TargetOS::IsWindows && m_compiler->opts.OptimizationEnabled() && @@ -1351,7 +1352,7 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) m_compiler->dspTreeID(allocObj)); } else -#endif // TARGET_XARCH +#endif // TARGET_AMD64 { GenTree* const newData = MorphAllocObjNodeIntoHelperCall(allocObj); stmtExpr->AsLclVar()->Data() = newData; From 891526760043a8d0c3ec57dbce969b97261c51e0 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 12 Mar 2026 14:05:24 +0100 Subject: [PATCH 03/10] fix --- src/coreclr/jit/codegen.h | 1 + src/coreclr/jit/codegenxarch.cpp | 98 +++++++++++++++++++------------- src/coreclr/jit/gentree.cpp | 8 +++ src/coreclr/jit/gentree.h | 1 + src/coreclr/jit/gtlist.h | 2 +- src/coreclr/jit/lower.cpp | 6 -- src/coreclr/jit/lsrabuild.cpp | 4 -- 
src/coreclr/jit/lsraxarch.cpp | 19 ------- src/coreclr/jit/objectalloc.cpp | 9 ++- 9 files changed, 78 insertions(+), 70 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 99784f9cae8cb3..383b1950a630bd 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1152,6 +1152,7 @@ class CodeGen final : public CodeGenInterface void genCodeForStoreLclVar(GenTreeLclVar* tree); void genCodeForReturnTrap(GenTreeOp* tree); void genCodeForAllocObj(GenTreeAllocObj* tree); + void genInlineAllocCall(GenTreeCall* call); void genCodeForStoreInd(GenTreeStoreInd* tree); void genCodeForSwap(GenTreeOp* tree); void genCodeForCpObj(GenTreeBlk* cpObjNode); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 5d4e90bc3fbee6..d6e4ba006d4e28 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1728,99 +1728,112 @@ void CodeGen::genCodeForReturnTrap(GenTreeOp* tree) //------------------------------------------------------------------------ // genCodeForAllocObj: Generate code for GT_ALLOCOBJ - inline object allocation. -// -// Emits an inline bump-pointer allocation fast path with a slow-path -// fallback to CORINFO_HELP_NEWSFAST. The fast path accesses the thread-local -// ee_alloc_context, checks if there's enough space, bumps the allocation -// pointer, and sets the MethodTable on the new object. If there isn't enough -// space, it falls through to the slow path which calls the runtime helper. -// -// The entire fast path (from reading alloc_ptr to updating it) is marked -// as non-GC-interruptible via emitDisableGC/emitEnableGC. +// GT_ALLOCOBJ should have been morphed to a helper call. This codepath should never be reached. 
// void CodeGen::genCodeForAllocObj(GenTreeAllocObj* tree) { - assert(tree->OperIs(GT_ALLOCOBJ)); + unreached(); +} +//------------------------------------------------------------------------ +// genInlineAllocCall: Expand a CORINFO_HELP_NEWSFAST call inline with a +// bump-pointer fast path and a slow-path fallback to the helper. +// +// The call node has been processed by genCallPlaceRegArgs already, so the +// MethodTable argument is in REG_ARG_0 (rcx on Windows). This function +// replaces the call emission with: +// 1. TLS access to get the ee_alloc_context +// 2. Bump-pointer allocation (non-GC-interruptible) +// 3. If allocation doesn't fit: fall through to the normal helper call +// +void CodeGen::genInlineAllocCall(GenTreeCall* call) +{ const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocInfo = m_compiler->compGetAllocContextInfo(); assert(allocInfo->supported); - regNumber dstReg = tree->GetRegNum(); - regNumber mtReg = genConsumeReg(tree->gtGetOp1()); + // The call infrastructure has already placed the MethodTable* in the arg register. + genCallPlaceRegArgs(call); + + regNumber dstReg = call->GetRegNum(); + regNumber mtReg = REG_ARG_0; // MethodTable* argument - // Get internal temp registers - regNumber allocCtxReg = internalRegisters.Extract(tree, RBM_ALLINT); - regNumber tmpReg = internalRegisters.Extract(tree, RBM_ALLINT); + // We need two scratch registers. Use caller-saved registers that + // won't conflict with mtReg or dstReg. + // After the helper call, only rax (return value) matters. + // Use r10 and r11 as scratch — they are caller-saved volatile registers + // not used for argument passing on Windows. + regNumber allocCtxReg = REG_R10; + regNumber tmpReg = REG_R11; emitter* emit = GetEmitter(); // ---- TLS access: get pointer to ee_alloc_context ---- - // Currently only Windows x64 is supported for inline TLS access. 
- assert(TargetOS::IsWindows); - - // Windows x64: gs:[0x58] -> TLS array -> [tls_index * 8] -> + offset emit->emitIns_R_C(INS_mov, EA_PTRSIZE, allocCtxReg, FLD_GLOBAL_GS, (int)allocInfo->offsetOfThreadLocalStoragePointer); - // Load _tls_index value (accessType == IAT_PVALUE means addr points to the value) assert(allocInfo->tlsIndex.accessType == IAT_PVALUE); instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)allocInfo->tlsIndex.addr); emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, tmpReg, 0); - // Index into TLS array to get module's TLS block base emit->emitIns_R_ARX(INS_mov, EA_PTRSIZE, allocCtxReg, allocCtxReg, tmpReg, 8, 0); - // Add byte offset from TLS base to t_runtime_thread_locals assert(allocInfo->tlsRoot.accessType == IAT_VALUE); ssize_t tlsRootOffset = (ssize_t)allocInfo->tlsRoot.addr; emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, allocCtxReg, allocCtxReg, (int)tlsRootOffset); - // ---- Disable GC for the fast-path critical section ---- + // ---- Non-GC-interruptible bump allocation ---- GetEmitter()->emitDisableGC(); - // ---- Bump pointer allocation ---- - // Read baseSize from MethodTable first, before loading alloc_ptr into dstReg. 
+ // tmpReg = baseSize from MethodTable emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, mtReg, (int)allocInfo->methodTableBaseSizeOffset); - // dstReg = alloc_ptr (this will be the returned object pointer) + // dstReg = alloc_ptr emit->emitIns_R_AR(INS_mov, EA_PTRSIZE, dstReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); - // tmpReg = alloc_ptr + baseSize (potential new alloc_ptr) + // tmpReg = alloc_ptr + baseSize (new alloc_ptr) emit->emitIns_R_ARX(INS_lea, EA_PTRSIZE, tmpReg, dstReg, tmpReg, 1, 0); - // Compare new alloc_ptr against combined_limit + // Compare new alloc_ptr vs combined_limit emit->emitIns_R_AR(INS_cmp, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->combinedLimitFieldOffset); BasicBlock* slowPath = genCreateTempLabel(); inst_JMP(EJ_ja, slowPath); - // ---- Fast path: allocation succeeded ---- + // ---- Fast path ---- emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, mtReg, dstReg, (int)allocInfo->objectMethodTableOffset); emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); - // End the non-GC-interruptible region before leaving the fast path. GetEmitter()->emitEnableGC(); BasicBlock* done = genCreateTempLabel(); inst_JMP(EJ_jmp, done); - // ---- Slow path: call the allocation helper ---- - // This is in a GC-interruptible region so the helper call is a proper GC safe point. + // ---- Slow path: emit the normal helper call ---- genDefineTempLabel(slowPath); - regNumber mtArgReg = REG_ARG_0; - inst_Mov(TYP_I_IMPL, mtArgReg, mtReg, /* canSkip */ true); - gcInfo.gcMarkRegSetNpt(genRegMask(mtArgReg)); - + // mtReg (REG_ARG_0/rcx) still holds the MethodTable* — call the helper directly. genEmitHelperCall(CORINFO_HELP_NEWSFAST, 0, EA_PTRSIZE); - inst_Mov(TYP_REF, dstReg, REG_INTRET, /* canSkip */ true); + // Helper returns the new object in rax. 
+ if (dstReg != REG_INTRET) + { + inst_Mov(TYP_REF, dstReg, REG_INTRET, /* canSkip */ false); + } // ---- Done ---- genDefineTempLabel(done); gcInfo.gcMarkRegPtrVal(dstReg, TYP_REF); - genProduceReg(tree); + + // Move result to the call's destination register if different + if (call->GetRegNum() != dstReg) + { + inst_Mov(TYP_REF, call->GetRegNum(), dstReg, /* canSkip */ false); + gcInfo.gcMarkRegPtrVal(call->GetRegNum(), TYP_REF); + gcInfo.gcMarkRegSetNpt(genRegMask(dstReg)); + } + + genProduceReg(call); } /***************************************************************************** @@ -5975,6 +5988,15 @@ bool CodeGen::genEmitOptimizedGCWriteBarrier(GCInfo::WriteBarrierForm writeBarri // Produce code for a GT_CALL node void CodeGen::genCall(GenTreeCall* call) { +#ifdef TARGET_AMD64 + // Check if this is an allocation helper call marked for inline expansion + if ((call->gtCallMoreFlags & GTF_CALL_M_EXPAND_INLINE_ALLOC) != 0) + { + genInlineAllocCall(call); + return; + } +#endif + genAlignStackBeforeCall(call); // all virtuals should have been expanded into a control expression diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 3dfca0ba725a0a..9f60c5799c25eb 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -2846,6 +2846,14 @@ bool GenTree::Compare(GenTree* op1, GenTree* op2, bool swapOK) case GT_ARR_ADDR: break; + case GT_ALLOCOBJ: + if (op1->AsAllocObj()->gtNewHelper != op2->AsAllocObj()->gtNewHelper || + op1->AsAllocObj()->gtAllocObjClsHnd != op2->AsAllocObj()->gtAllocObjClsHnd) + { + return false; + } + break; + default: assert(!"unexpected unary ExOp operator"); } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index ed72dd9f8c87a5..db1503864150b9 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -4391,6 +4391,7 @@ enum GenTreeCallFlags : unsigned int GTF_CALL_M_CAST_OBJ_NONNULL = 0x04000000, // if we expand this specific cast we don't need to check the input object 
for null // NOTE: if needed, this flag can be removed, and we can introduce new _NONNUL cast helpers GTF_CALL_M_STACK_ARRAY = 0x08000000, // this call is a new array helper for a stack allocated array. + GTF_CALL_M_EXPAND_INLINE_ALLOC = 0x10000000, // this allocation helper call should be expanded inline in codegen }; inline constexpr GenTreeCallFlags operator ~(GenTreeCallFlags a) diff --git a/src/coreclr/jit/gtlist.h b/src/coreclr/jit/gtlist.h index cdddcbdc1dbbf7..e8f61c1dea5263 100644 --- a/src/coreclr/jit/gtlist.h +++ b/src/coreclr/jit/gtlist.h @@ -91,7 +91,7 @@ GTNODE(ARR_LENGTH , GenTreeArrLen ,0,0,GTK_UNOP|GTK_EXOP) GTNODE(MDARR_LENGTH , GenTreeMDArr ,0,1,GTK_UNOP|GTK_EXOP) // multi-dimension (MD) array length of a specific dimension GTNODE(MDARR_LOWER_BOUND, GenTreeMDArr ,0,1,GTK_UNOP|GTK_EXOP) // multi-dimension (MD) array lower bound of a specific dimension GTNODE(FIELD_ADDR , GenTreeFieldAddr ,0,0,GTK_UNOP|GTK_EXOP|DBK_NOTLIR) // Field address -GTNODE(ALLOCOBJ , GenTreeAllocObj ,0,0,GTK_UNOP|GTK_EXOP) // object allocator +GTNODE(ALLOCOBJ , GenTreeAllocObj ,0,0,GTK_UNOP|GTK_EXOP|DBK_NOTLIR) // object allocator GTNODE(INIT_VAL , GenTreeOp ,0,1,GTK_UNOP) // Initialization value for an initBlk diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 48688ba7439702..7ad423c51ab822 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -492,12 +492,6 @@ GenTree* Lowering::LowerNode(GenTree* node) ContainCheckReturnTrap(node->AsOp()); break; -#ifdef TARGET_XARCH - case GT_ALLOCOBJ: - // GT_ALLOCOBJ operand (MethodTable handle) must be in a register - break; -#endif - case GT_CAST: { GenTree* nextNode = node->gtNext; diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index ba858c2e72b1a5..79ddcc5b1c732e 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1066,10 +1066,6 @@ regMaskTP LinearScan::getKillSetForNode(GenTree* tree) killMask = 
m_compiler->compHelperCallKillSet(CORINFO_HELP_STOP_FOR_GC); break; - case GT_ALLOCOBJ: - killMask = m_compiler->compHelperCallKillSet(CORINFO_HELP_NEWSFAST); - break; - case GT_CALL: killMask = getKillSetForCall(tree->AsCall()); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index db07c3e847fcef..22350a732d4ad0 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -315,25 +315,6 @@ int LinearScan::BuildNode(GenTree* tree) } break; - case GT_ALLOCOBJ: - { - // Inline object allocation: TLS access + bump pointer + slow-path helper call. - // Internal temps for TLS access intermediates and allocation calculations. - buildInternalIntRegisterDefForNode(tree); - buildInternalIntRegisterDefForNode(tree); - // The MethodTable operand must remain live until after we load alloc_ptr - // into the destination register (they may not alias). - srcCount = BuildDelayFreeUses(tree->gtGetOp1(), tree); - // Internal regs must not overlap with the destination register either, - // since allocCtxReg is used throughout the fast and slow paths. - setInternalRegsDelayFree = true; - buildInternalRegisterUses(); - killMask = m_compiler->compHelperCallKillSet(CORINFO_HELP_NEWSFAST); - BuildKills(tree, killMask); - BuildDef(tree); - } - break; - case GT_MOD: case GT_DIV: case GT_UMOD: diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index 2b16385081d6c3..c3dab1bb42e85a 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -1341,15 +1341,20 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) GenTreeAllocObj* allocObj = stmtExpr->AsLclVar()->Data()->AsAllocObj(); #ifdef TARGET_AMD64 - // Check if we can keep GT_ALLOCOBJ for inline allocation expansion in codegen. + // Check if we can expand the allocation inline in codegen. // Currently only Windows x64 is supported. 
const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); if (allocObj->gtNewHelper == CORINFO_HELP_NEWSFAST && !allocObj->gtHelperHasSideEffects && allocCtxInfo->supported && TargetOS::IsWindows && m_compiler->opts.OptimizationEnabled() && JitConfig.JitInlineAllocFast() != 0) { - JITDUMP("Keeping GT_ALLOCOBJ [%06u] for inline allocation expansion\n", + JITDUMP("Marking allocation [%06u] for inline expansion\n", m_compiler->dspTreeID(allocObj)); + // Morph to helper call, but mark it for inline expansion in codegen + GenTree* const newData = MorphAllocObjNodeIntoHelperCall(allocObj); + newData->AsCall()->gtCallMoreFlags |= GTF_CALL_M_EXPAND_INLINE_ALLOC; + stmtExpr->AsLclVar()->Data() = newData; + stmtExpr->AddAllEffectsFlags(newData); } else #endif // TARGET_AMD64 From ae052b06c3dd328956a87c0a595f2ffb38826e19 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 12 Mar 2026 14:12:15 +0100 Subject: [PATCH 04/10] enable for linux --- src/coreclr/jit/codegenxarch.cpp | 55 +++++++++++++++++++++++--------- src/coreclr/jit/objectalloc.cpp | 3 +- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index d6e4ba006d4e28..2b51bc3d7c74a8 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1755,31 +1755,56 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) genCallPlaceRegArgs(call); regNumber dstReg = call->GetRegNum(); - regNumber mtReg = REG_ARG_0; // MethodTable* argument + regNumber mtReg = REG_ARG_0; // MethodTable* argument (rcx on Windows, rdi on Linux) - // We need two scratch registers. Use caller-saved registers that - // won't conflict with mtReg or dstReg. - // After the helper call, only rax (return value) matters. - // Use r10 and r11 as scratch — they are caller-saved volatile registers - // not used for argument passing on Windows. 
+ // We use r10 and r11 as scratch — they are caller-saved volatile registers + // not used for argument passing on either Windows or SysV. regNumber allocCtxReg = REG_R10; regNumber tmpReg = REG_R11; emitter* emit = GetEmitter(); // ---- TLS access: get pointer to ee_alloc_context ---- - emit->emitIns_R_C(INS_mov, EA_PTRSIZE, allocCtxReg, FLD_GLOBAL_GS, - (int)allocInfo->offsetOfThreadLocalStoragePointer); + if (TargetOS::IsWindows) + { + // Windows x64: gs:[0x58] -> TLS array -> [tls_index * 8] -> + offset + emit->emitIns_R_C(INS_mov, EA_PTRSIZE, allocCtxReg, FLD_GLOBAL_GS, + (int)allocInfo->offsetOfThreadLocalStoragePointer); + + assert(allocInfo->tlsIndex.accessType == IAT_PVALUE); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)allocInfo->tlsIndex.addr); + emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, tmpReg, 0); - assert(allocInfo->tlsIndex.accessType == IAT_PVALUE); - instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)allocInfo->tlsIndex.addr); - emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, tmpReg, 0); + emit->emitIns_R_ARX(INS_mov, EA_PTRSIZE, allocCtxReg, allocCtxReg, tmpReg, 8, 0); - emit->emitIns_R_ARX(INS_mov, EA_PTRSIZE, allocCtxReg, allocCtxReg, tmpReg, 8, 0); + assert(allocInfo->tlsRoot.accessType == IAT_VALUE); + ssize_t tlsRootOffset = (ssize_t)allocInfo->tlsRoot.addr; + emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, allocCtxReg, allocCtxReg, (int)tlsRootOffset); + } + else + { + // Linux x64: call __tls_get_addr with the pre-resolved TLSGD descriptor. + // __tls_get_addr(descriptor) returns the address of t_runtime_thread_locals. + // The call clobbers all caller-saved registers, so save the MT pointer on the stack. 
- assert(allocInfo->tlsRoot.accessType == IAT_VALUE); - ssize_t tlsRootOffset = (ssize_t)allocInfo->tlsRoot.addr; - emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, allocCtxReg, allocCtxReg, (int)tlsRootOffset); + emit->emitIns_R(INS_push, EA_PTRSIZE, mtReg); + + // Load the TLSGD descriptor address into rdi (first arg for __tls_get_addr on SysV) + assert(allocInfo->tlsRoot.accessType == IAT_VALUE); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, REG_ARG_0, (ssize_t)allocInfo->tlsRoot.addr); + + // Call __tls_get_addr — result in rax + assert(allocInfo->tlsGetAddrFtnPtr != nullptr); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, REG_R11, (ssize_t)allocInfo->tlsGetAddrFtnPtr); + emit->emitIns_R(INS_call, EA_PTRSIZE, REG_R11); + + // rax = address of t_runtime_thread_locals; move to r10 for allocCtxReg + emit->emitIns_Mov(INS_mov, EA_PTRSIZE, REG_R10, REG_RAX, /* canSkip */ false); + allocCtxReg = REG_R10; + + // Restore the MethodTable pointer + emit->emitIns_R(INS_pop, EA_PTRSIZE, mtReg); + } // ---- Non-GC-interruptible bump allocation ---- GetEmitter()->emitDisableGC(); diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index c3dab1bb42e85a..9d9911fb8c4104 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -1345,7 +1345,8 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) // Currently only Windows x64 is supported. 
const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); if (allocObj->gtNewHelper == CORINFO_HELP_NEWSFAST && !allocObj->gtHelperHasSideEffects && - allocCtxInfo->supported && TargetOS::IsWindows && m_compiler->opts.OptimizationEnabled() && + allocCtxInfo->supported && (TargetOS::IsWindows || TargetOS::IsUnix) && + m_compiler->opts.OptimizationEnabled() && JitConfig.JitInlineAllocFast() != 0) { JITDUMP("Marking allocation [%06u] for inline expansion\n", From e54f5d2fd9bd9e8442d8a9618f30907ba8bac368 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 12 Mar 2026 14:37:36 +0100 Subject: [PATCH 05/10] feedback --- src/coreclr/jit/codegen.h | 2 ++ src/coreclr/jit/codegenxarch.cpp | 27 +++++++++++-------- src/coreclr/jit/objectalloc.cpp | 8 +++--- .../tools/Common/JitInterface/CorInfoImpl.cs | 2 +- 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 383b1950a630bd..f9a505677a94da 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1152,7 +1152,9 @@ class CodeGen final : public CodeGenInterface void genCodeForStoreLclVar(GenTreeLclVar* tree); void genCodeForReturnTrap(GenTreeOp* tree); void genCodeForAllocObj(GenTreeAllocObj* tree); +#ifdef TARGET_AMD64 void genInlineAllocCall(GenTreeCall* call); +#endif void genCodeForStoreInd(GenTreeStoreInd* tree); void genCodeForSwap(GenTreeOp* tree); void genCodeForCpObj(GenTreeBlk* cpObjNode); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 2b51bc3d7c74a8..dee4a276d1c6a6 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1746,6 +1746,7 @@ void CodeGen::genCodeForAllocObj(GenTreeAllocObj* tree) // 2. Bump-pointer allocation (non-GC-interruptible) // 3. 
If allocation doesn't fit: fall through to the normal helper call // +#ifdef TARGET_AMD64 void CodeGen::genInlineAllocCall(GenTreeCall* call) { const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocInfo = m_compiler->compGetAllocContextInfo(); @@ -1755,7 +1756,7 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) genCallPlaceRegArgs(call); regNumber dstReg = call->GetRegNum(); - regNumber mtReg = REG_ARG_0; // MethodTable* argument (rcx on Windows, rdi on Linux) + regNumber mtReg = REG_ARG_0; // MethodTable* argument (rcx on Windows, rdi on Linux) // We use r10 and r11 as scratch — they are caller-saved volatile registers // not used for argument passing on either Windows or SysV. @@ -1806,19 +1807,13 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) emit->emitIns_R(INS_pop, EA_PTRSIZE, mtReg); } - // ---- Non-GC-interruptible bump allocation ---- + // ---- Bump allocation (non-GC-interruptible) ---- GetEmitter()->emitDisableGC(); - // tmpReg = baseSize from MethodTable + // Size not known at JIT time — read from MethodTable.m_BaseSize at runtime. emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, mtReg, (int)allocInfo->methodTableBaseSizeOffset); - - // dstReg = alloc_ptr emit->emitIns_R_AR(INS_mov, EA_PTRSIZE, dstReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); - - // tmpReg = alloc_ptr + baseSize (new alloc_ptr) emit->emitIns_R_ARX(INS_lea, EA_PTRSIZE, tmpReg, dstReg, tmpReg, 1, 0); - - // Compare new alloc_ptr vs combined_limit emit->emitIns_R_AR(INS_cmp, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->combinedLimitFieldOffset); BasicBlock* slowPath = genCreateTempLabel(); @@ -1828,15 +1823,18 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, mtReg, dstReg, (int)allocInfo->objectMethodTableOffset); emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); + // End the no-GC region. 
This creates an IG boundary; all subsequent IGs + // (including the slow path after the label below) are GC-interruptible. GetEmitter()->emitEnableGC(); BasicBlock* done = genCreateTempLabel(); inst_JMP(EJ_jmp, done); - // ---- Slow path: emit the normal helper call ---- + // ---- Slow path: call the allocation helper ---- + // This IG is GC-interruptible (emitEnableGC was called above, and the label + // starts a new IG inheriting that state). The helper call is a GC safe point. genDefineTempLabel(slowPath); - // mtReg (REG_ARG_0/rcx) still holds the MethodTable* — call the helper directly. genEmitHelperCall(CORINFO_HELP_NEWSFAST, 0, EA_PTRSIZE); // Helper returns the new object in rax. @@ -1860,6 +1858,7 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) genProduceReg(call); } +#endif // TARGET_AMD64 /***************************************************************************** * @@ -6017,6 +6016,12 @@ void CodeGen::genCall(GenTreeCall* call) // Check if this is an allocation helper call marked for inline expansion if ((call->gtCallMoreFlags & GTF_CALL_M_EXPAND_INLINE_ALLOC) != 0) { + // Handle AVX/SSE transition before the slow-path helper call + if (GetEmitter()->Contains256bitOrMoreAVX() && call->NeedsVzeroupper(m_compiler)) + { + instGen(INS_vzeroupper); + } + genInlineAllocCall(call); return; } diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index 9d9911fb8c4104..9bfe66f3d3a65d 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -1346,13 +1346,11 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); if (allocObj->gtNewHelper == CORINFO_HELP_NEWSFAST && !allocObj->gtHelperHasSideEffects && allocCtxInfo->supported && (TargetOS::IsWindows || TargetOS::IsUnix) && - m_compiler->opts.OptimizationEnabled() && - JitConfig.JitInlineAllocFast() != 0) + 
m_compiler->opts.OptimizationEnabled() && JitConfig.JitInlineAllocFast() != 0) { - JITDUMP("Marking allocation [%06u] for inline expansion\n", - m_compiler->dspTreeID(allocObj)); + JITDUMP("Marking allocation [%06u] for inline expansion\n", m_compiler->dspTreeID(allocObj)); // Morph to helper call, but mark it for inline expansion in codegen - GenTree* const newData = MorphAllocObjNodeIntoHelperCall(allocObj); + GenTree* const newData = MorphAllocObjNodeIntoHelperCall(allocObj); newData->AsCall()->gtCallMoreFlags |= GTF_CALL_M_EXPAND_INLINE_ALLOC; stmtExpr->AsLclVar()->Data() = newData; stmtExpr->AddAllEffectsFlags(newData); diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs index d133bc0acff459..d7c2ca5de8f257 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs @@ -3715,7 +3715,7 @@ private uint getThreadTLSIndex(ref void* ppIndirection) private void getObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) { // NativeAOT/crossgen2: not yet implemented - pInfo->supported = 0; + *pInfo = default; } private Dictionary _helperCache = new Dictionary(); From 6da6690988980b2af4408200051b86dae5c2d426 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 12 Mar 2026 15:28:45 +0100 Subject: [PATCH 06/10] add array allocators --- src/coreclr/inc/corinfo.h | 9 +++- src/coreclr/inc/jiteeversionguid.h | 10 ++-- src/coreclr/jit/codegenxarch.cpp | 50 +++++++++++++------ src/coreclr/jit/objectalloc.cpp | 43 ++++++++++++++++ .../tools/Common/JitInterface/CorInfoTypes.cs | 3 ++ .../tools/superpmi/superpmi-shared/agnostic.h | 3 ++ .../superpmi-shared/methodcontext.cpp | 6 +++ src/coreclr/vm/jitinterface.cpp | 5 ++ 8 files changed, 106 insertions(+), 23 deletions(-) diff --git a/src/coreclr/inc/corinfo.h b/src/coreclr/inc/corinfo.h index e28b2a431de9f7..746aeaf1c00129 100644 --- a/src/coreclr/inc/corinfo.h +++ b/src/coreclr/inc/corinfo.h 
@@ -1712,8 +1712,13 @@ struct CORINFO_OBJECT_ALLOC_CONTEXT_INFO uint32_t combinedLimitFieldOffset; // Offset of combined_limit // Object/MethodTable layout offsets - uint32_t objectMethodTableOffset; // Offset of MethodTable* in Object - uint32_t methodTableBaseSizeOffset; // Offset of m_BaseSize in MethodTable + uint32_t objectMethodTableOffset; // Offset of MethodTable* in Object (0) + uint32_t methodTableBaseSizeOffset; // Offset of m_BaseSize in MethodTable (4) + + // Array layout info + uint32_t arrayLengthOffset; // Offset of m_NumComponents in ArrayBase (8) + uint32_t arrayBaseSize; // Fixed overhead for SZ arrays (SZARRAY_BASE_SIZE, 0x18) + uint32_t methodTableComponentSizeOffset; // Offset of component size in MethodTable (0, low 16 bits of m_dwFlags) // TLS access info (platform-specific) CORINFO_CONST_LOOKUP tlsIndex; // Windows: address of _tls_index (IAT_PVALUE) diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index 2edea7be1c5565..7406edecc6dbf6 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -37,11 +37,11 @@ #include -constexpr GUID JITEEVersionIdentifier = { /* 4383dd79-4927-4fee-a314-84cff6e87501 */ - 0x4383dd79, - 0x4927, - 0x4fee, - {0xa3, 0x14, 0x84, 0xcf, 0xf6, 0xe8, 0x75, 0x01} +constexpr GUID JITEEVersionIdentifier = { /* 7b2c0eb5-6677-4c72-bbf3-f9d32c55a6b7 */ + 0x7b2c0eb5, + 0x6677, + 0x4c72, + {0xbb, 0xf3, 0xf9, 0xd3, 0x2c, 0x55, 0xa6, 0xb7} }; #endif // JIT_EE_VERSIONING_GUID_H diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index dee4a276d1c6a6..420dde0946bfdd 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1752,11 +1752,15 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocInfo = m_compiler->compGetAllocContextInfo(); assert(allocInfo->supported); - // The call infrastructure has already placed the MethodTable* in the arg register. 
+ // Determine if this is an object or array allocation. + CorInfoHelpFunc helperNum = call->GetHelperNum(); + bool isArray = (helperNum == CORINFO_HELP_NEWARR_1_VC || helperNum == CORINFO_HELP_NEWARR_1_PTR); + genCallPlaceRegArgs(call); regNumber dstReg = call->GetRegNum(); - regNumber mtReg = REG_ARG_0; // MethodTable* argument (rcx on Windows, rdi on Linux) + regNumber mtReg = REG_ARG_0; + regNumber lenReg = isArray ? REG_ARG_1 : REG_NA; // We use r10 and r11 as scratch — they are caller-saved volatile registers // not used for argument passing on either Windows or SysV. @@ -1785,33 +1789,47 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) else { // Linux x64: call __tls_get_addr with the pre-resolved TLSGD descriptor. - // __tls_get_addr(descriptor) returns the address of t_runtime_thread_locals. - // The call clobbers all caller-saved registers, so save the MT pointer on the stack. + // The call clobbers all caller-saved registers, so save args on the stack. emit->emitIns_R(INS_push, EA_PTRSIZE, mtReg); + if (isArray) + { + emit->emitIns_R(INS_push, EA_PTRSIZE, lenReg); + } - // Load the TLSGD descriptor address into rdi (first arg for __tls_get_addr on SysV) assert(allocInfo->tlsRoot.accessType == IAT_VALUE); instGen_Set_Reg_To_Imm(EA_PTRSIZE, REG_ARG_0, (ssize_t)allocInfo->tlsRoot.addr); - // Call __tls_get_addr — result in rax assert(allocInfo->tlsGetAddrFtnPtr != nullptr); instGen_Set_Reg_To_Imm(EA_PTRSIZE, REG_R11, (ssize_t)allocInfo->tlsGetAddrFtnPtr); emit->emitIns_R(INS_call, EA_PTRSIZE, REG_R11); - // rax = address of t_runtime_thread_locals; move to r10 for allocCtxReg emit->emitIns_Mov(INS_mov, EA_PTRSIZE, REG_R10, REG_RAX, /* canSkip */ false); allocCtxReg = REG_R10; - // Restore the MethodTable pointer + if (isArray) + { + emit->emitIns_R(INS_pop, EA_PTRSIZE, lenReg); + } emit->emitIns_R(INS_pop, EA_PTRSIZE, mtReg); } // ---- Bump allocation (non-GC-interruptible) ---- GetEmitter()->emitDisableGC(); - // Size not known at JIT time — read 
from MethodTable.m_BaseSize at runtime. - emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, mtReg, (int)allocInfo->methodTableBaseSizeOffset); + if (isArray) + { + // Array total size = ALIGN8(SZARRAY_BASE_SIZE + elementCount * componentSize) + emit->emitIns_R_AR(INS_mov, EA_2BYTE, tmpReg, mtReg, (int)allocInfo->methodTableComponentSizeOffset); + emit->emitIns_R_R(INS_imul, EA_PTRSIZE, tmpReg, lenReg); + emit->emitIns_R_I(INS_add, EA_PTRSIZE, tmpReg, (ssize_t)(allocInfo->arrayBaseSize + 7)); + emit->emitIns_R_I(INS_and, EA_PTRSIZE, tmpReg, -8); + } + else + { + emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, mtReg, (int)allocInfo->methodTableBaseSizeOffset); + } + emit->emitIns_R_AR(INS_mov, EA_PTRSIZE, dstReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); emit->emitIns_R_ARX(INS_lea, EA_PTRSIZE, tmpReg, dstReg, tmpReg, 1, 0); emit->emitIns_R_AR(INS_cmp, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->combinedLimitFieldOffset); @@ -1821,21 +1839,21 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) // ---- Fast path ---- emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, mtReg, dstReg, (int)allocInfo->objectMethodTableOffset); + if (isArray) + { + emit->emitIns_AR_R(INS_mov, EA_4BYTE, lenReg, dstReg, (int)allocInfo->arrayLengthOffset); + } emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); - // End the no-GC region. This creates an IG boundary; all subsequent IGs - // (including the slow path after the label below) are GC-interruptible. GetEmitter()->emitEnableGC(); BasicBlock* done = genCreateTempLabel(); inst_JMP(EJ_jmp, done); - // ---- Slow path: call the allocation helper ---- - // This IG is GC-interruptible (emitEnableGC was called above, and the label - // starts a new IG inheriting that state). The helper call is a GC safe point. 
+ // ---- Slow path ---- genDefineTempLabel(slowPath); - genEmitHelperCall(CORINFO_HELP_NEWSFAST, 0, EA_PTRSIZE); + genEmitHelperCall(helperNum, 0, EA_PTRSIZE); // Helper returns the new object in rax. if (dstReg != REG_INTRET) diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index 9bfe66f3d3a65d..2270ff5471daf1 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -1267,6 +1267,26 @@ bool ObjectAllocator::MorphAllocObjNodes() if (allocType == OAT_NONE) { +#ifdef TARGET_AMD64 + // Even if escape analysis doesn't apply, mark eligible array allocation + // calls for inline expansion in codegen. + if (data->IsHelperCall() && m_compiler->opts.OptimizationEnabled() && + JitConfig.JitInlineAllocFast() != 0) + { + GenTreeCall* const call = data->AsCall(); + CorInfoHelpFunc helper = call->GetHelperNum(); + if (helper == CORINFO_HELP_NEWARR_1_VC || helper == CORINFO_HELP_NEWARR_1_PTR) + { + const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); + if (allocCtxInfo->supported && (TargetOS::IsWindows || TargetOS::IsUnix)) + { + JITDUMP("Marking array allocation [%06u] for inline expansion\n", + m_compiler->dspTreeID(call)); + call->gtCallMoreFlags |= GTF_CALL_M_EXPAND_INLINE_ALLOC; + } + } + } +#endif continue; } @@ -1364,6 +1384,29 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) } } +#ifdef TARGET_AMD64 + // Mark non-stack-allocated array calls for inline expansion + if (candidate.m_allocType == OAT_NEWARR) + { + GenTree* const data = candidate.m_tree->AsLclVar()->Data(); + if (data->IsHelperCall()) + { + GenTreeCall* const call = data->AsCall(); + CorInfoHelpFunc helper = call->GetHelperNum(); + if ((helper == CORINFO_HELP_NEWARR_1_VC || helper == CORINFO_HELP_NEWARR_1_PTR) && + m_compiler->opts.OptimizationEnabled() && JitConfig.JitInlineAllocFast() != 0) + { + const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = 
m_compiler->compGetAllocContextInfo(); + if (allocCtxInfo->supported && (TargetOS::IsWindows || TargetOS::IsUnix)) + { + JITDUMP("Marking array allocation [%06u] for inline expansion\n", m_compiler->dspTreeID(call)); + call->gtCallMoreFlags |= GTF_CALL_M_EXPAND_INLINE_ALLOC; + } + } + } + } +#endif + if (IsTrackedLocal(lclNum)) { AddConnGraphEdgeIndex(LocalToIndex(lclNum), m_unknownSourceIndex); diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs index f439f2d9c6a61c..bcbf9567eb81ac 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs @@ -1283,6 +1283,9 @@ public unsafe struct CORINFO_OBJECT_ALLOC_CONTEXT_INFO public uint combinedLimitFieldOffset; public uint objectMethodTableOffset; public uint methodTableBaseSizeOffset; + public uint arrayLengthOffset; + public uint arrayBaseSize; + public uint methodTableComponentSizeOffset; public CORINFO_CONST_LOOKUP tlsIndex; public uint offsetOfThreadLocalStoragePointer; public CORINFO_CONST_LOOKUP tlsRoot; diff --git a/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h b/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h index a67e38cb41c1e9..d95aa129c7dd30 100644 --- a/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h +++ b/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h @@ -579,6 +579,9 @@ struct Agnostic_GetObjectAllocContextInfo DWORD combinedLimitFieldOffset; DWORD objectMethodTableOffset; DWORD methodTableBaseSizeOffset; + DWORD arrayLengthOffset; + DWORD arrayBaseSize; + DWORD methodTableComponentSizeOffset; Agnostic_CORINFO_CONST_LOOKUP tlsIndex; DWORD offsetOfThreadLocalStoragePointer; Agnostic_CORINFO_CONST_LOOKUP tlsRoot; diff --git a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp index b69252329d83f2..010e116a45bc20 100644 --- 
a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp +++ b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp @@ -7825,6 +7825,9 @@ void MethodContext::recGetObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_IN value.combinedLimitFieldOffset = pInfo->combinedLimitFieldOffset; value.objectMethodTableOffset = pInfo->objectMethodTableOffset; value.methodTableBaseSizeOffset = pInfo->methodTableBaseSizeOffset; + value.arrayLengthOffset = pInfo->arrayLengthOffset; + value.arrayBaseSize = pInfo->arrayBaseSize; + value.methodTableComponentSizeOffset = pInfo->methodTableComponentSizeOffset; value.tlsIndex = SpmiRecordsHelper::StoreAgnostic_CORINFO_CONST_LOOKUP(&pInfo->tlsIndex); value.offsetOfThreadLocalStoragePointer = pInfo->offsetOfThreadLocalStoragePointer; value.tlsRoot = SpmiRecordsHelper::StoreAgnostic_CORINFO_CONST_LOOKUP(&pInfo->tlsRoot); @@ -7865,6 +7868,9 @@ void MethodContext::repGetObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_IN pInfo->combinedLimitFieldOffset = value.combinedLimitFieldOffset; pInfo->objectMethodTableOffset = value.objectMethodTableOffset; pInfo->methodTableBaseSizeOffset = value.methodTableBaseSizeOffset; + pInfo->arrayLengthOffset = value.arrayLengthOffset; + pInfo->arrayBaseSize = value.arrayBaseSize; + pInfo->methodTableComponentSizeOffset = value.methodTableComponentSizeOffset; pInfo->tlsIndex = SpmiRecordsHelper::RestoreCORINFO_CONST_LOOKUP(value.tlsIndex); pInfo->offsetOfThreadLocalStoragePointer = value.offsetOfThreadLocalStoragePointer; pInfo->tlsRoot = SpmiRecordsHelper::RestoreCORINFO_CONST_LOOKUP(value.tlsRoot); diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index f69608d203960f..523a078990605a 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -1380,6 +1380,11 @@ void CEEInfo::getObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo pInfo->objectMethodTableOffset = (uint32_t)cdac_data::m_pMethTab; 
pInfo->methodTableBaseSizeOffset = (uint32_t)cdac_data::BaseSize; + // Array layout + pInfo->arrayLengthOffset = (uint32_t)cdac_data::m_NumComponents; + pInfo->arrayBaseSize = ARRAYBASE_BASESIZE; + pInfo->methodTableComponentSizeOffset = (uint32_t)cdac_data::MTFlags;// component size is low 16 bits of m_dwFlags + // TLS access info - how to reach t_runtime_thread_locals GetObjectAllocContextTlsInfo(pInfo); } From 347f5b701f308f02897825d0c56d87cd73acf576 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 12 Mar 2026 15:55:21 +0100 Subject: [PATCH 07/10] feedback --- src/coreclr/jit/codegenxarch.cpp | 54 ++++++++++++++++++++------------ src/coreclr/jit/objectalloc.cpp | 11 +++---- src/coreclr/vm/threadstatics.cpp | 8 ++--- 3 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 420dde0946bfdd..e12568a791d7bb 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1752,7 +1752,6 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocInfo = m_compiler->compGetAllocContextInfo(); assert(allocInfo->supported); - // Determine if this is an object or array allocation. CorInfoHelpFunc helperNum = call->GetHelperNum(); bool isArray = (helperNum == CORINFO_HELP_NEWARR_1_VC || helperNum == CORINFO_HELP_NEWARR_1_PTR); @@ -1762,8 +1761,6 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) regNumber mtReg = REG_ARG_0; regNumber lenReg = isArray ? REG_ARG_1 : REG_NA; - // We use r10 and r11 as scratch — they are caller-saved volatile registers - // not used for argument passing on either Windows or SysV. 
regNumber allocCtxReg = REG_R10; regNumber tmpReg = REG_R11; @@ -1772,30 +1769,30 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) // ---- TLS access: get pointer to ee_alloc_context ---- if (TargetOS::IsWindows) { - // Windows x64: gs:[0x58] -> TLS array -> [tls_index * 8] -> + offset emit->emitIns_R_C(INS_mov, EA_PTRSIZE, allocCtxReg, FLD_GLOBAL_GS, (int)allocInfo->offsetOfThreadLocalStoragePointer); - assert(allocInfo->tlsIndex.accessType == IAT_PVALUE); - instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)allocInfo->tlsIndex.addr); - emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, tmpReg, 0); + assert(allocInfo->tlsIndex.accessType == IAT_VALUE); + instGen_Set_Reg_To_Imm(EA_4BYTE, tmpReg, (ssize_t)allocInfo->tlsIndex.addr); emit->emitIns_R_ARX(INS_mov, EA_PTRSIZE, allocCtxReg, allocCtxReg, tmpReg, 8, 0); assert(allocInfo->tlsRoot.accessType == IAT_VALUE); - ssize_t tlsRootOffset = (ssize_t)allocInfo->tlsRoot.addr; - emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, allocCtxReg, allocCtxReg, (int)tlsRootOffset); + emit->emitIns_R_AR(INS_lea, EA_PTRSIZE, allocCtxReg, allocCtxReg, (int)(ssize_t)allocInfo->tlsRoot.addr); } else { - // Linux x64: call __tls_get_addr with the pre-resolved TLSGD descriptor. - // The call clobbers all caller-saved registers, so save args on the stack. - + // Linux x64: call __tls_get_addr. Save arg registers on the stack. + // Always push an even number of 8-byte values for 16-byte stack alignment. 
emit->emitIns_R(INS_push, EA_PTRSIZE, mtReg); if (isArray) { emit->emitIns_R(INS_push, EA_PTRSIZE, lenReg); } + else + { + emit->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, 8); + } assert(allocInfo->tlsRoot.accessType == IAT_VALUE); instGen_Set_Reg_To_Imm(EA_PTRSIZE, REG_ARG_0, (ssize_t)allocInfo->tlsRoot.addr); @@ -1811,16 +1808,26 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) { emit->emitIns_R(INS_pop, EA_PTRSIZE, lenReg); } + else + { + emit->emitIns_R_I(INS_add, EA_PTRSIZE, REG_SPBASE, 8); + } emit->emitIns_R(INS_pop, EA_PTRSIZE, mtReg); } + BasicBlock* slowPath = genCreateTempLabel(); + // ---- Bump allocation (non-GC-interruptible) ---- GetEmitter()->emitDisableGC(); if (isArray) { - // Array total size = ALIGN8(SZARRAY_BASE_SIZE + elementCount * componentSize) - emit->emitIns_R_AR(INS_mov, EA_2BYTE, tmpReg, mtReg, (int)allocInfo->methodTableComponentSizeOffset); + // Validate element count: must be in [0, 0x7FFFFFFF]. + emit->emitIns_R_I(INS_cmp, EA_PTRSIZE, lenReg, 0x7FFFFFFF); + inst_JMP(EJ_ja, slowPath); + + // Array total size = ALIGN8(arrayBaseSize + elementCount * componentSize) + emit->emitIns_R_AR(INS_movzx, EA_2BYTE, tmpReg, mtReg, (int)allocInfo->methodTableComponentSizeOffset); emit->emitIns_R_R(INS_imul, EA_PTRSIZE, tmpReg, lenReg); emit->emitIns_R_I(INS_add, EA_PTRSIZE, tmpReg, (ssize_t)(allocInfo->arrayBaseSize + 7)); emit->emitIns_R_I(INS_and, EA_PTRSIZE, tmpReg, -8); @@ -1830,20 +1837,27 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, mtReg, (int)allocInfo->methodTableBaseSizeOffset); } + // Use subtraction-based comparison (matches the runtime helper) to avoid + // alloc_ptr + size overflow: available = combined_limit - alloc_ptr; + // if (size > available) goto slowPath; + emit->emitIns_R_AR(INS_mov, EA_PTRSIZE, dstReg, allocCtxReg, (int)allocInfo->combinedLimitFieldOffset); + emit->emitIns_R_AR(INS_sub, EA_PTRSIZE, dstReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); + 
emit->emitIns_R_R(INS_cmp, EA_PTRSIZE, tmpReg, dstReg); + inst_JMP(EJ_ja, slowPath); + + // Allocation fits. dstReg = alloc_ptr (the new object). emit->emitIns_R_AR(INS_mov, EA_PTRSIZE, dstReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); - emit->emitIns_R_ARX(INS_lea, EA_PTRSIZE, tmpReg, dstReg, tmpReg, 1, 0); - emit->emitIns_R_AR(INS_cmp, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->combinedLimitFieldOffset); - BasicBlock* slowPath = genCreateTempLabel(); - inst_JMP(EJ_ja, slowPath); + // Compute and store new alloc_ptr = alloc_ptr + size + emit->emitIns_R_R(INS_add, EA_PTRSIZE, tmpReg, dstReg); + emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); - // ---- Fast path ---- + // Set MethodTable pointer on the new object emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, mtReg, dstReg, (int)allocInfo->objectMethodTableOffset); if (isArray) { emit->emitIns_AR_R(INS_mov, EA_4BYTE, lenReg, dstReg, (int)allocInfo->arrayLengthOffset); } - emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); GetEmitter()->emitEnableGC(); diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index 2270ff5471daf1..dd17a1c2dc8d18 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -1278,7 +1278,7 @@ bool ObjectAllocator::MorphAllocObjNodes() if (helper == CORINFO_HELP_NEWARR_1_VC || helper == CORINFO_HELP_NEWARR_1_PTR) { const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); - if (allocCtxInfo->supported && (TargetOS::IsWindows || TargetOS::IsUnix)) + if (allocCtxInfo->supported) { JITDUMP("Marking array allocation [%06u] for inline expansion\n", m_compiler->dspTreeID(call)); @@ -1362,14 +1362,13 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) #ifdef TARGET_AMD64 // Check if we can expand the allocation inline in codegen. - // Currently only Windows x64 is supported. 
+ // Supported on Windows x64 and non-Apple Unix x64. const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); if (allocObj->gtNewHelper == CORINFO_HELP_NEWSFAST && !allocObj->gtHelperHasSideEffects && - allocCtxInfo->supported && (TargetOS::IsWindows || TargetOS::IsUnix) && - m_compiler->opts.OptimizationEnabled() && JitConfig.JitInlineAllocFast() != 0) + allocCtxInfo->supported && m_compiler->opts.OptimizationEnabled() && + JitConfig.JitInlineAllocFast() != 0) { JITDUMP("Marking allocation [%06u] for inline expansion\n", m_compiler->dspTreeID(allocObj)); - // Morph to helper call, but mark it for inline expansion in codegen GenTree* const newData = MorphAllocObjNodeIntoHelperCall(allocObj); newData->AsCall()->gtCallMoreFlags |= GTF_CALL_M_EXPAND_INLINE_ALLOC; stmtExpr->AsLclVar()->Data() = newData; @@ -1397,7 +1396,7 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) m_compiler->opts.OptimizationEnabled() && JitConfig.JitInlineAllocFast() != 0) { const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); - if (allocCtxInfo->supported && (TargetOS::IsWindows || TargetOS::IsUnix)) + if (allocCtxInfo->supported) { JITDUMP("Marking array allocation [%06u] for inline expansion\n", m_compiler->dspTreeID(call)); call->gtCallMoreFlags |= GTF_CALL_M_EXPAND_INLINE_ALLOC; diff --git a/src/coreclr/vm/threadstatics.cpp b/src/coreclr/vm/threadstatics.cpp index 76ee48fb8dd4ea..b6e13608740f5b 100644 --- a/src/coreclr/vm/threadstatics.cpp +++ b/src/coreclr/vm/threadstatics.cpp @@ -1204,15 +1204,15 @@ void GetObjectAllocContextTlsInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) STANDARD_VM_CONTRACT; #if defined(TARGET_WINDOWS) - pInfo->tlsIndex.addr = (void*)&_tls_index; - pInfo->tlsIndex.accessType = IAT_PVALUE; + pInfo->tlsIndex.addr = (void*)static_cast(_tls_index); + pInfo->tlsIndex.accessType = IAT_VALUE; pInfo->offsetOfThreadLocalStoragePointer = offsetof(_TEB, 
ThreadLocalStoragePointer); pInfo->tlsRoot.addr = (void*)(uintptr_t)ThreadLocalOffset(&t_runtime_thread_locals); pInfo->tlsRoot.accessType = IAT_VALUE; #elif defined(TARGET_APPLE) && defined(TARGET_AMD64) - uint8_t* p = reinterpret_cast(&GetRuntimeThreadLocalsThreadVarsAddress); - pInfo->threadVarsSection = GetThreadVarsSectionAddressFromDesc(p); + // macOS TLVP model not yet implemented in JIT codegen + pInfo->supported = false; #elif defined(TARGET_AMD64) pInfo->tlsGetAddrFtnPtr = reinterpret_cast(&__tls_get_addr); From 49a621b5e1c1bcf5604d91a79e7368a9474f305b Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 12 Mar 2026 17:41:54 +0100 Subject: [PATCH 08/10] ci fix --- src/coreclr/jit/codegenxarch.cpp | 71 +++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index e12568a791d7bb..3cc50c75b41c8a 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1766,6 +1766,62 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) emitter* emit = GetEmitter(); + // Try to extract compile-time constants from the call arguments. + // The first arg is the MethodTable handle; for arrays, the second is the element count. 
+ CORINFO_CLASS_HANDLE clsHnd = nullptr; + unsigned constLen = 0; + bool hasConstLen = false; + + CallArg* firstArg = call->gtArgs.GetArgByIndex(0); + if (firstArg != nullptr) + { + GenTree* argNode = firstArg->GetNode(); + if (argNode->OperIs(GT_PUTARG_REG)) + argNode = argNode->gtGetOp1(); + if (argNode->IsIconHandle(GTF_ICON_CLASS_HDL)) + clsHnd = (CORINFO_CLASS_HANDLE)argNode->AsIntCon()->gtIconVal; + } + + if (isArray) + { + CallArg* secondArg = call->gtArgs.GetArgByIndex(1); + if (secondArg != nullptr) + { + GenTree* argNode = secondArg->GetNode(); + if (argNode->OperIs(GT_PUTARG_REG)) + argNode = argNode->gtGetOp1(); + if (argNode->IsCnsIntOrI() && argNode->AsIntCon()->gtIconVal >= 0 && + argNode->AsIntCon()->gtIconVal <= 0x7FFFFFFF) + { + constLen = (unsigned)argNode->AsIntCon()->gtIconVal; + hasConstLen = true; + } + } + } + + // Constant-fold the allocation size when possible. + // For objects: read m_BaseSize from the MethodTable at JIT time. + // For arrays: compute ALIGN8(arrayBaseSize + constLen * componentSize). + unsigned constSize = 0; + if (clsHnd != nullptr) + { + if (!isArray) + { + // Read m_BaseSize directly from the MethodTable at JIT time. 
+ constSize = *(unsigned*)((uint8_t*)(void*)clsHnd + allocInfo->methodTableBaseSizeOffset); + } + else if (hasConstLen) + { + unsigned componentSize = *(uint16_t*)((uint8_t*)(void*)clsHnd + allocInfo->methodTableComponentSizeOffset); + uint64_t totalSize = (uint64_t)allocInfo->arrayBaseSize + (uint64_t)constLen * componentSize; + totalSize = (totalSize + 7) & ~(uint64_t)7; + if (totalSize <= 0x7FFFFFFF) + { + constSize = (unsigned)totalSize; + } + } + } + // ---- TLS access: get pointer to ee_alloc_context ---- if (TargetOS::IsWindows) { @@ -1799,7 +1855,13 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) assert(allocInfo->tlsGetAddrFtnPtr != nullptr); instGen_Set_Reg_To_Imm(EA_PTRSIZE, REG_R11, (ssize_t)allocInfo->tlsGetAddrFtnPtr); - emit->emitIns_R(INS_call, EA_PTRSIZE, REG_R11); + { + EmitCallParams callParams; + callParams.callType = EC_INDIR_R; + callParams.ireg = REG_R11; + callParams.noSafePoint = true; + genEmitCallWithCurrentGC(callParams); + } emit->emitIns_Mov(INS_mov, EA_PTRSIZE, REG_R10, REG_RAX, /* canSkip */ false); allocCtxReg = REG_R10; @@ -1820,7 +1882,12 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) // ---- Bump allocation (non-GC-interruptible) ---- GetEmitter()->emitDisableGC(); - if (isArray) + if (constSize > 0) + { + // Size is known at JIT time — use immediate constant. + instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)constSize); + } + else if (isArray) { // Validate element count: must be in [0, 0x7FFFFFFF]. 
emit->emitIns_R_I(INS_cmp, EA_PTRSIZE, lenReg, 0x7FFFFFFF); From 7441ac1b555b60b01cf83d8daab5ec3734b30797 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 12 Mar 2026 20:15:35 +0100 Subject: [PATCH 09/10] Refactor inline object allocation: remove arrays, add ARM64 support - Remove NEWARR_1_VC/PTR inline expansion from objectalloc.cpp and codegen - Remove constant-size folding at codegen time (no more JIT-time MT reads) - Remove objectMethodTableOffset field (MT pointer always at offset 0) - Remove arrayLengthOffset, arrayBaseSize, methodTableComponentSizeOffset - Remove threadVarsSection (macOS not supported for inline alloc) - Add tlsRootOffset field for Linux ARM64 (pre-computed tpidr_el0 offset) - Add genInlineAllocCall for ARM64 in codegenarmarch.cpp - Windows: x18 (TEB) + TLS array + index + offset pattern - Linux: mrs tpidr_el0 + pre-computed offset (no function call needed) - macOS: not supported (set supported=false) - Add GetRuntimeThreadLocalsVariableOffset assembly stub for ARM64 Linux - Update CORINFO_OBJECT_ALLOC_CONTEXT_INFO in corinfo.h, CorInfoTypes.cs - Update SuperPMI agnostic struct and methodcontext rec/dmp/rep - Update jitinterface.cpp and threadstatics.cpp for new struct layout - Change TARGET_AMD64 guards to TARGET_AMD64 || TARGET_ARM64 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/inc/corinfo.h | 20 +-- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/codegenarmarch.cpp | 142 ++++++++++++++++++ src/coreclr/jit/codegenxarch.cpp | 115 ++------------ src/coreclr/jit/objectalloc.cpp | 48 +----- .../tools/Common/JitInterface/CorInfoTypes.cs | 6 +- .../tools/superpmi/superpmi-shared/agnostic.h | 6 +- .../superpmi-shared/methodcontext.cpp | 20 +-- src/coreclr/vm/arm64/asmhelpers.S | 18 +++ src/coreclr/vm/jitinterface.cpp | 8 +- src/coreclr/vm/threadstatics.cpp | 8 +- 11 files changed, 194 insertions(+), 199 deletions(-) diff --git a/src/coreclr/inc/corinfo.h b/src/coreclr/inc/corinfo.h index 
746aeaf1c00129..6cee88067477e8 100644 --- a/src/coreclr/inc/corinfo.h +++ b/src/coreclr/inc/corinfo.h @@ -1711,22 +1711,16 @@ struct CORINFO_OBJECT_ALLOC_CONTEXT_INFO uint32_t allocPtrFieldOffset; // Offset of alloc_ptr uint32_t combinedLimitFieldOffset; // Offset of combined_limit - // Object/MethodTable layout offsets - uint32_t objectMethodTableOffset; // Offset of MethodTable* in Object (0) - uint32_t methodTableBaseSizeOffset; // Offset of m_BaseSize in MethodTable (4) - - // Array layout info - uint32_t arrayLengthOffset; // Offset of m_NumComponents in ArrayBase (8) - uint32_t arrayBaseSize; // Fixed overhead for SZ arrays (SZARRAY_BASE_SIZE, 0x18) - uint32_t methodTableComponentSizeOffset; // Offset of component size in MethodTable (0, low 16 bits of m_dwFlags) + // MethodTable layout offset + uint32_t methodTableBaseSizeOffset; // Offset of m_BaseSize in MethodTable // TLS access info (platform-specific) - CORINFO_CONST_LOOKUP tlsIndex; // Windows: address of _tls_index (IAT_PVALUE) - uint32_t offsetOfThreadLocalStoragePointer; // Windows: TEB offset for TLS array (0x58 on x64) + CORINFO_CONST_LOOKUP tlsIndex; // Windows: address of _tls_index (IAT_VALUE) + uint32_t offsetOfThreadLocalStoragePointer; // Windows: TEB offset for TLS array (0x58 on x64, 0x58 on ARM64) CORINFO_CONST_LOOKUP tlsRoot; // Windows: byte offset from the module TLS base to t_runtime_thread_locals (IAT_VALUE); - // Linux: TLSGD descriptor address; macOS: TLVP descriptor address - void* tlsGetAddrFtnPtr; // Linux: address of __tls_get_addr - void* threadVarsSection; // macOS: section address for TLVP + // Linux x64: TLSGD descriptor address + void* tlsGetAddrFtnPtr; // Linux x64: address of __tls_get_addr + size_t tlsRootOffset; // Linux ARM64: pre-computed tpidr_el0 offset to t_runtime_thread_locals }; //---------------------------------------------------------------------------- diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index f9a505677a94da..a3b98f199e0c4e 
100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1152,7 +1152,7 @@ class CodeGen final : public CodeGenInterface void genCodeForStoreLclVar(GenTreeLclVar* tree); void genCodeForReturnTrap(GenTreeOp* tree); void genCodeForAllocObj(GenTreeAllocObj* tree); -#ifdef TARGET_AMD64 +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) void genInlineAllocCall(GenTreeCall* call); #endif void genCodeForStoreInd(GenTreeStoreInd* tree); diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index fcef600921e254..4293d3c8d50e6e 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -3050,11 +3050,153 @@ void CodeGen::genCodeForInitBlkLoop(GenTreeBlk* initBlkNode) } } +//------------------------------------------------------------------------ +// genInlineAllocCall: Expand a CORINFO_HELP_NEWSFAST call inline with a +// bump-pointer fast path and a slow-path fallback to the helper. +// +// On ARM64, the allocation sequence is: +// 1. TLS access to get the ee_alloc_context +// - Windows: x18 (TEB) + TLS array + index + offset +// - Linux: mrs xN, tpidr_el0 + pre-computed offset +// 2. Bump-pointer allocation (non-GC-interruptible) +// 3. If allocation doesn't fit: fall through to the normal helper call +// +#ifdef TARGET_ARM64 +void CodeGen::genInlineAllocCall(GenTreeCall* call) +{ + const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocInfo = m_compiler->compGetAllocContextInfo(); + assert(allocInfo->supported); + + genCallPlaceRegArgs(call); + + regNumber dstReg = call->GetRegNum(); + regNumber mtReg = REG_ARG_0; // x0 + + // Use IP0/IP1 (x16/x17) as scratch — they are caller-saved and not arg regs. + regNumber allocCtxReg = REG_IP0; // x16 + regNumber tmpReg = REG_IP1; // x17 + + // Since this replaces a call, all caller-saved registers except mtReg (x0) are free. + // We use x1 to save mtReg, and x2 to hold alloc_ptr during the bump allocation. 
+ regNumber savedMtReg = REG_R1; + regNumber allocPtrScratch = REG_R2; + + emitter* emit = GetEmitter(); + + // ---- TLS access: get pointer to ee_alloc_context ---- + if (TargetOS::IsWindows) + { + // Windows ARM64: x18 holds TEB + // ldr allocCtxReg, [x18, #offsetOfTLS] // TEB -> TLS array + // mov tmpReg, #_tls_index + // ldr allocCtxReg, [allocCtxReg, tmpReg, lsl #3] + // add allocCtxReg, allocCtxReg, #tlsRoot + emit->emitIns_R_R_I(INS_ldr, EA_PTRSIZE, allocCtxReg, REG_R18, + (int)allocInfo->offsetOfThreadLocalStoragePointer); + + assert(allocInfo->tlsIndex.accessType == IAT_VALUE); + instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)allocInfo->tlsIndex.addr); + + emit->emitIns_R_R_R_Ext(INS_ldr, EA_PTRSIZE, allocCtxReg, allocCtxReg, tmpReg, INS_OPTS_LSL, 3); + + assert(allocInfo->tlsRoot.accessType == IAT_VALUE); + ssize_t tlsRootVal = (ssize_t)allocInfo->tlsRoot.addr; + instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, tlsRootVal); + emit->emitIns_R_R_R(INS_add, EA_PTRSIZE, allocCtxReg, allocCtxReg, tmpReg); + } + else + { + // Linux ARM64: mrs xN, tpidr_el0 + pre-computed offset. No function call needed! + emit->emitIns_R(INS_mrs_tpid0, EA_PTRSIZE, allocCtxReg); + if (allocInfo->tlsRootOffset != 0) + { + instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)allocInfo->tlsRootOffset); + emit->emitIns_R_R_R(INS_add, EA_PTRSIZE, allocCtxReg, allocCtxReg, tmpReg); + } + } + + BasicBlock* slowPath = genCreateTempLabel(); + + // ---- Bump allocation (non-GC-interruptible) ---- + emit->emitDisableGC(); + + // Save mtReg so we can reuse x0 as a scratch register. 
+ emit->emitIns_Mov(INS_mov, EA_PTRSIZE, savedMtReg, mtReg, /* canSkip */ false); + + // Load m_BaseSize (32-bit) from the MethodTable + emit->emitIns_R_R_I(INS_ldr, EA_4BYTE, tmpReg, mtReg, (int)allocInfo->methodTableBaseSizeOffset); + + // Load alloc_ptr and combined_limit + emit->emitIns_R_R_I(INS_ldr, EA_PTRSIZE, allocPtrScratch, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); + emit->emitIns_R_R_I(INS_ldr, EA_PTRSIZE, dstReg, allocCtxReg, (int)allocInfo->combinedLimitFieldOffset); + + // available = combined_limit - alloc_ptr; if (baseSize > available) goto slowPath + emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, dstReg, dstReg, allocPtrScratch); + emit->emitIns_R_R(INS_cmp, EA_PTRSIZE, tmpReg, dstReg); + inst_JMP(EJ_hi, slowPath); + + // Fast path: allocation fits. + // new_alloc_ptr = alloc_ptr + baseSize + emit->emitIns_R_R_R(INS_add, EA_PTRSIZE, tmpReg, allocPtrScratch, tmpReg); + + // Store MethodTable pointer at offset 0 of the new object + emit->emitIns_R_R_I(INS_str, EA_PTRSIZE, savedMtReg, allocPtrScratch, 0); + + // Update alloc_ptr in the ee_alloc_context + emit->emitIns_R_R_I(INS_str, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); + + // Result = alloc_ptr (the new object) + emit->emitIns_Mov(INS_mov, EA_PTRSIZE, dstReg, allocPtrScratch, /* canSkip */ true); + + emit->emitEnableGC(); + + BasicBlock* done = genCreateTempLabel(); + inst_JMP(EJ_jmp, done); + + // ---- Slow path ---- + genDefineTempLabel(slowPath); + + // Restore mtReg for the helper call + emit->emitIns_Mov(INS_mov, EA_PTRSIZE, mtReg, savedMtReg, /* canSkip */ false); + + genEmitHelperCall(CORINFO_HELP_NEWSFAST, 0, EA_PTRSIZE); + + // Helper returns the new object in x0. 
+ if (dstReg != REG_INTRET) + { + inst_Mov(TYP_REF, dstReg, REG_INTRET, /* canSkip */ false); + } + + // ---- Done ---- + genDefineTempLabel(done); + + gcInfo.gcMarkRegPtrVal(dstReg, TYP_REF); + + if (call->GetRegNum() != dstReg) + { + inst_Mov(TYP_REF, call->GetRegNum(), dstReg, /* canSkip */ false); + gcInfo.gcMarkRegPtrVal(call->GetRegNum(), TYP_REF); + gcInfo.gcMarkRegSetNpt(genRegMask(dstReg)); + } + + genProduceReg(call); +} +#endif // TARGET_ARM64 + //------------------------------------------------------------------------ // genCall: Produce code for a GT_CALL node // void CodeGen::genCall(GenTreeCall* call) { +#ifdef TARGET_ARM64 + // Check if this is an allocation helper call marked for inline expansion + if ((call->gtCallMoreFlags & GTF_CALL_M_EXPAND_INLINE_ALLOC) != 0) + { + genInlineAllocCall(call); + return; + } +#endif + genCallPlaceRegArgs(call); // Insert a null check on "this" pointer if asked. diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 3cc50c75b41c8a..55a37cef51cbd2 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -1752,76 +1752,16 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocInfo = m_compiler->compGetAllocContextInfo(); assert(allocInfo->supported); - CorInfoHelpFunc helperNum = call->GetHelperNum(); - bool isArray = (helperNum == CORINFO_HELP_NEWARR_1_VC || helperNum == CORINFO_HELP_NEWARR_1_PTR); - genCallPlaceRegArgs(call); regNumber dstReg = call->GetRegNum(); regNumber mtReg = REG_ARG_0; - regNumber lenReg = isArray ? REG_ARG_1 : REG_NA; regNumber allocCtxReg = REG_R10; regNumber tmpReg = REG_R11; emitter* emit = GetEmitter(); - // Try to extract compile-time constants from the call arguments. - // The first arg is the MethodTable handle; for arrays, the second is the element count. 
- CORINFO_CLASS_HANDLE clsHnd = nullptr; - unsigned constLen = 0; - bool hasConstLen = false; - - CallArg* firstArg = call->gtArgs.GetArgByIndex(0); - if (firstArg != nullptr) - { - GenTree* argNode = firstArg->GetNode(); - if (argNode->OperIs(GT_PUTARG_REG)) - argNode = argNode->gtGetOp1(); - if (argNode->IsIconHandle(GTF_ICON_CLASS_HDL)) - clsHnd = (CORINFO_CLASS_HANDLE)argNode->AsIntCon()->gtIconVal; - } - - if (isArray) - { - CallArg* secondArg = call->gtArgs.GetArgByIndex(1); - if (secondArg != nullptr) - { - GenTree* argNode = secondArg->GetNode(); - if (argNode->OperIs(GT_PUTARG_REG)) - argNode = argNode->gtGetOp1(); - if (argNode->IsCnsIntOrI() && argNode->AsIntCon()->gtIconVal >= 0 && - argNode->AsIntCon()->gtIconVal <= 0x7FFFFFFF) - { - constLen = (unsigned)argNode->AsIntCon()->gtIconVal; - hasConstLen = true; - } - } - } - - // Constant-fold the allocation size when possible. - // For objects: read m_BaseSize from the MethodTable at JIT time. - // For arrays: compute ALIGN8(arrayBaseSize + constLen * componentSize). - unsigned constSize = 0; - if (clsHnd != nullptr) - { - if (!isArray) - { - // Read m_BaseSize directly from the MethodTable at JIT time. - constSize = *(unsigned*)((uint8_t*)(void*)clsHnd + allocInfo->methodTableBaseSizeOffset); - } - else if (hasConstLen) - { - unsigned componentSize = *(uint16_t*)((uint8_t*)(void*)clsHnd + allocInfo->methodTableComponentSizeOffset); - uint64_t totalSize = (uint64_t)allocInfo->arrayBaseSize + (uint64_t)constLen * componentSize; - totalSize = (totalSize + 7) & ~(uint64_t)7; - if (totalSize <= 0x7FFFFFFF) - { - constSize = (unsigned)totalSize; - } - } - } - // ---- TLS access: get pointer to ee_alloc_context ---- if (TargetOS::IsWindows) { @@ -1838,17 +1778,10 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) } else { - // Linux x64: call __tls_get_addr. Save arg registers on the stack. - // Always push an even number of 8-byte values for 16-byte stack alignment. + // Linux x64: call __tls_get_addr. 
Save arg register on the stack. + // Push an even number of 8-byte values for 16-byte stack alignment. emit->emitIns_R(INS_push, EA_PTRSIZE, mtReg); - if (isArray) - { - emit->emitIns_R(INS_push, EA_PTRSIZE, lenReg); - } - else - { - emit->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, 8); - } + emit->emitIns_R_I(INS_sub, EA_PTRSIZE, REG_SPBASE, 8); assert(allocInfo->tlsRoot.accessType == IAT_VALUE); instGen_Set_Reg_To_Imm(EA_PTRSIZE, REG_ARG_0, (ssize_t)allocInfo->tlsRoot.addr); @@ -1866,14 +1799,7 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) emit->emitIns_Mov(INS_mov, EA_PTRSIZE, REG_R10, REG_RAX, /* canSkip */ false); allocCtxReg = REG_R10; - if (isArray) - { - emit->emitIns_R(INS_pop, EA_PTRSIZE, lenReg); - } - else - { - emit->emitIns_R_I(INS_add, EA_PTRSIZE, REG_SPBASE, 8); - } + emit->emitIns_R_I(INS_add, EA_PTRSIZE, REG_SPBASE, 8); emit->emitIns_R(INS_pop, EA_PTRSIZE, mtReg); } @@ -1882,27 +1808,8 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) // ---- Bump allocation (non-GC-interruptible) ---- GetEmitter()->emitDisableGC(); - if (constSize > 0) - { - // Size is known at JIT time — use immediate constant. - instGen_Set_Reg_To_Imm(EA_PTRSIZE, tmpReg, (ssize_t)constSize); - } - else if (isArray) - { - // Validate element count: must be in [0, 0x7FFFFFFF]. 
- emit->emitIns_R_I(INS_cmp, EA_PTRSIZE, lenReg, 0x7FFFFFFF); - inst_JMP(EJ_ja, slowPath); - - // Array total size = ALIGN8(arrayBaseSize + elementCount * componentSize) - emit->emitIns_R_AR(INS_movzx, EA_2BYTE, tmpReg, mtReg, (int)allocInfo->methodTableComponentSizeOffset); - emit->emitIns_R_R(INS_imul, EA_PTRSIZE, tmpReg, lenReg); - emit->emitIns_R_I(INS_add, EA_PTRSIZE, tmpReg, (ssize_t)(allocInfo->arrayBaseSize + 7)); - emit->emitIns_R_I(INS_and, EA_PTRSIZE, tmpReg, -8); - } - else - { - emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, mtReg, (int)allocInfo->methodTableBaseSizeOffset); - } + // Load m_BaseSize from the MethodTable + emit->emitIns_R_AR(INS_mov, EA_4BYTE, tmpReg, mtReg, (int)allocInfo->methodTableBaseSizeOffset); // Use subtraction-based comparison (matches the runtime helper) to avoid // alloc_ptr + size overflow: available = combined_limit - alloc_ptr; @@ -1919,12 +1826,8 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) emit->emitIns_R_R(INS_add, EA_PTRSIZE, tmpReg, dstReg); emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, tmpReg, allocCtxReg, (int)allocInfo->allocPtrFieldOffset); - // Set MethodTable pointer on the new object - emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, mtReg, dstReg, (int)allocInfo->objectMethodTableOffset); - if (isArray) - { - emit->emitIns_AR_R(INS_mov, EA_4BYTE, lenReg, dstReg, (int)allocInfo->arrayLengthOffset); - } + // Set MethodTable pointer on the new object (always at offset 0) + emit->emitIns_AR_R(INS_mov, EA_PTRSIZE, mtReg, dstReg, 0); GetEmitter()->emitEnableGC(); @@ -1934,7 +1837,7 @@ void CodeGen::genInlineAllocCall(GenTreeCall* call) // ---- Slow path ---- genDefineTempLabel(slowPath); - genEmitHelperCall(helperNum, 0, EA_PTRSIZE); + genEmitHelperCall(CORINFO_HELP_NEWSFAST, 0, EA_PTRSIZE); // Helper returns the new object in rax. 
if (dstReg != REG_INTRET) diff --git a/src/coreclr/jit/objectalloc.cpp b/src/coreclr/jit/objectalloc.cpp index dd17a1c2dc8d18..79c199b56c3403 100644 --- a/src/coreclr/jit/objectalloc.cpp +++ b/src/coreclr/jit/objectalloc.cpp @@ -1267,26 +1267,6 @@ bool ObjectAllocator::MorphAllocObjNodes() if (allocType == OAT_NONE) { -#ifdef TARGET_AMD64 - // Even if escape analysis doesn't apply, mark eligible array allocation - // calls for inline expansion in codegen. - if (data->IsHelperCall() && m_compiler->opts.OptimizationEnabled() && - JitConfig.JitInlineAllocFast() != 0) - { - GenTreeCall* const call = data->AsCall(); - CorInfoHelpFunc helper = call->GetHelperNum(); - if (helper == CORINFO_HELP_NEWARR_1_VC || helper == CORINFO_HELP_NEWARR_1_PTR) - { - const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); - if (allocCtxInfo->supported) - { - JITDUMP("Marking array allocation [%06u] for inline expansion\n", - m_compiler->dspTreeID(call)); - call->gtCallMoreFlags |= GTF_CALL_M_EXPAND_INLINE_ALLOC; - } - } - } -#endif continue; } @@ -1360,9 +1340,8 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) GenTree* const stmtExpr = candidate.m_tree; GenTreeAllocObj* allocObj = stmtExpr->AsLclVar()->Data()->AsAllocObj(); -#ifdef TARGET_AMD64 +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) // Check if we can expand the allocation inline in codegen. - // Supported on Windows x64 and non-Apple Unix x64. 
const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); if (allocObj->gtNewHelper == CORINFO_HELP_NEWSFAST && !allocObj->gtHelperHasSideEffects && allocCtxInfo->supported && m_compiler->opts.OptimizationEnabled() && @@ -1375,7 +1354,7 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) stmtExpr->AddAllEffectsFlags(newData); } else -#endif // TARGET_AMD64 +#endif // TARGET_AMD64 || TARGET_ARM64 { GenTree* const newData = MorphAllocObjNodeIntoHelperCall(allocObj); stmtExpr->AsLclVar()->Data() = newData; @@ -1383,29 +1362,6 @@ void ObjectAllocator::MorphAllocObjNode(AllocationCandidate& candidate) } } -#ifdef TARGET_AMD64 - // Mark non-stack-allocated array calls for inline expansion - if (candidate.m_allocType == OAT_NEWARR) - { - GenTree* const data = candidate.m_tree->AsLclVar()->Data(); - if (data->IsHelperCall()) - { - GenTreeCall* const call = data->AsCall(); - CorInfoHelpFunc helper = call->GetHelperNum(); - if ((helper == CORINFO_HELP_NEWARR_1_VC || helper == CORINFO_HELP_NEWARR_1_PTR) && - m_compiler->opts.OptimizationEnabled() && JitConfig.JitInlineAllocFast() != 0) - { - const CORINFO_OBJECT_ALLOC_CONTEXT_INFO* allocCtxInfo = m_compiler->compGetAllocContextInfo(); - if (allocCtxInfo->supported) - { - JITDUMP("Marking array allocation [%06u] for inline expansion\n", m_compiler->dspTreeID(call)); - call->gtCallMoreFlags |= GTF_CALL_M_EXPAND_INLINE_ALLOC; - } - } - } - } -#endif - if (IsTrackedLocal(lclNum)) { AddConnGraphEdgeIndex(LocalToIndex(lclNum), m_unknownSourceIndex); diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs index bcbf9567eb81ac..e44515f94b679e 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs @@ -1281,16 +1281,12 @@ public unsafe struct CORINFO_OBJECT_ALLOC_CONTEXT_INFO public byte supported; public uint allocPtrFieldOffset; public 
uint combinedLimitFieldOffset; - public uint objectMethodTableOffset; public uint methodTableBaseSizeOffset; - public uint arrayLengthOffset; - public uint arrayBaseSize; - public uint methodTableComponentSizeOffset; public CORINFO_CONST_LOOKUP tlsIndex; public uint offsetOfThreadLocalStoragePointer; public CORINFO_CONST_LOOKUP tlsRoot; public nuint tlsGetAddrFtnPtr; - public nuint threadVarsSection; + public nuint tlsRootOffset; }; // System V struct passing diff --git a/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h b/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h index d95aa129c7dd30..bef4404cbd67e0 100644 --- a/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h +++ b/src/coreclr/tools/superpmi/superpmi-shared/agnostic.h @@ -577,16 +577,12 @@ struct Agnostic_GetObjectAllocContextInfo DWORD supported; DWORD allocPtrFieldOffset; DWORD combinedLimitFieldOffset; - DWORD objectMethodTableOffset; DWORD methodTableBaseSizeOffset; - DWORD arrayLengthOffset; - DWORD arrayBaseSize; - DWORD methodTableComponentSizeOffset; Agnostic_CORINFO_CONST_LOOKUP tlsIndex; DWORD offsetOfThreadLocalStoragePointer; Agnostic_CORINFO_CONST_LOOKUP tlsRoot; DWORDLONG tlsGetAddrFtnPtr; - DWORDLONG threadVarsSection; + DWORDLONG tlsRootOffset; }; struct Agnostic_GetClassCtorInitializationInfo diff --git a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp index 010e116a45bc20..a2cbe403710d43 100644 --- a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp +++ b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp @@ -7823,16 +7823,12 @@ void MethodContext::recGetObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_IN value.supported = pInfo->supported ? 
1 : 0; value.allocPtrFieldOffset = pInfo->allocPtrFieldOffset; value.combinedLimitFieldOffset = pInfo->combinedLimitFieldOffset; - value.objectMethodTableOffset = pInfo->objectMethodTableOffset; value.methodTableBaseSizeOffset = pInfo->methodTableBaseSizeOffset; - value.arrayLengthOffset = pInfo->arrayLengthOffset; - value.arrayBaseSize = pInfo->arrayBaseSize; - value.methodTableComponentSizeOffset = pInfo->methodTableComponentSizeOffset; value.tlsIndex = SpmiRecordsHelper::StoreAgnostic_CORINFO_CONST_LOOKUP(&pInfo->tlsIndex); value.offsetOfThreadLocalStoragePointer = pInfo->offsetOfThreadLocalStoragePointer; value.tlsRoot = SpmiRecordsHelper::StoreAgnostic_CORINFO_CONST_LOOKUP(&pInfo->tlsRoot); value.tlsGetAddrFtnPtr = CastPointer(pInfo->tlsGetAddrFtnPtr); - value.threadVarsSection = CastPointer(pInfo->threadVarsSection); + value.tlsRootOffset = (DWORDLONG)pInfo->tlsRootOffset; DWORD key = 0; GetObjectAllocContextInfo->Add(key, value); @@ -7843,17 +7839,17 @@ void MethodContext::dmpGetObjectAllocContextInfo(DWORD key, const Agnostic_GetOb { printf("GetObjectAllocContextInfo key %u, supported-%u" ", allocPtrFieldOffset-%u, combinedLimitFieldOffset-%u" - ", objectMethodTableOffset-%u, methodTableBaseSizeOffset-%u" + ", methodTableBaseSizeOffset-%u" ", tlsIndex-%s, offsetOfThreadLocalStoragePointer-%u" ", tlsRoot-%s, tlsGetAddrFtnPtr-%016" PRIX64 - ", threadVarsSection-%016" PRIX64, + ", tlsRootOffset-%016" PRIX64, key, value.supported, value.allocPtrFieldOffset, value.combinedLimitFieldOffset, - value.objectMethodTableOffset, value.methodTableBaseSizeOffset, + value.methodTableBaseSizeOffset, SpmiDumpHelper::DumpAgnostic_CORINFO_CONST_LOOKUP(value.tlsIndex).c_str(), value.offsetOfThreadLocalStoragePointer, SpmiDumpHelper::DumpAgnostic_CORINFO_CONST_LOOKUP(value.tlsRoot).c_str(), - value.tlsGetAddrFtnPtr, value.threadVarsSection); + value.tlsGetAddrFtnPtr, value.tlsRootOffset); } void MethodContext::repGetObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* 
pInfo) @@ -7866,14 +7862,10 @@ void MethodContext::repGetObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_IN pInfo->supported = (value.supported != 0); pInfo->allocPtrFieldOffset = value.allocPtrFieldOffset; pInfo->combinedLimitFieldOffset = value.combinedLimitFieldOffset; - pInfo->objectMethodTableOffset = value.objectMethodTableOffset; pInfo->methodTableBaseSizeOffset = value.methodTableBaseSizeOffset; - pInfo->arrayLengthOffset = value.arrayLengthOffset; - pInfo->arrayBaseSize = value.arrayBaseSize; - pInfo->methodTableComponentSizeOffset = value.methodTableComponentSizeOffset; pInfo->tlsIndex = SpmiRecordsHelper::RestoreCORINFO_CONST_LOOKUP(value.tlsIndex); pInfo->offsetOfThreadLocalStoragePointer = value.offsetOfThreadLocalStoragePointer; pInfo->tlsRoot = SpmiRecordsHelper::RestoreCORINFO_CONST_LOOKUP(value.tlsRoot); pInfo->tlsGetAddrFtnPtr = (void*)value.tlsGetAddrFtnPtr; - pInfo->threadVarsSection = (void*)value.threadVarsSection; + pInfo->tlsRootOffset = (size_t)value.tlsRootOffset; } diff --git a/src/coreclr/vm/arm64/asmhelpers.S b/src/coreclr/vm/arm64/asmhelpers.S index 76ab00eb6700f3..9d8e9233eec7ea 100644 --- a/src/coreclr/vm/arm64/asmhelpers.S +++ b/src/coreclr/vm/arm64/asmhelpers.S @@ -658,6 +658,24 @@ LEAF_ENTRY GetTLSResolverAddress, _TEXT EPILOG_RETURN LEAF_END GetTLSResolverAddress, _TEXT // ------------------------------------------------------------------ + +// ------------------------------------------------------------------ +// size_t GetRuntimeThreadLocalsVariableOffset() + +// Helper to calculate the offset of native thread local variable `t_runtime_thread_locals` in TCB. +// The offset, after calculation is returned in `x0` register. 
+ +LEAF_ENTRY GetRuntimeThreadLocalsVariableOffset, _TEXT + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -32 + adrp x0, :tlsdesc:t_runtime_thread_locals + ldr x1, [x0, #:tlsdesc_lo12:t_runtime_thread_locals] + add x0, x0, :tlsdesc_lo12:t_runtime_thread_locals + .tlsdesccall t_runtime_thread_locals + blr x1 + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 32 + EPILOG_RETURN +LEAF_END GetRuntimeThreadLocalsVariableOffset, _TEXT +// ------------------------------------------------------------------ #endif // TARGET_ANDROID #endif // !TARGET_APPLE diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index 523a078990605a..a2c6b11a0f309d 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -1376,15 +1376,9 @@ void CEEInfo::getObjectAllocContextInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo pInfo->allocPtrFieldOffset = (uint32_t)(offsetof(ee_alloc_context, m_GCAllocContext) + offsetof(gc_alloc_context, alloc_ptr)); pInfo->combinedLimitFieldOffset = (uint32_t)offsetof(ee_alloc_context, m_CombinedLimit); - // Object/MethodTable layout - pInfo->objectMethodTableOffset = (uint32_t)cdac_data::m_pMethTab; + // MethodTable layout pInfo->methodTableBaseSizeOffset = (uint32_t)cdac_data::BaseSize; - // Array layout - pInfo->arrayLengthOffset = (uint32_t)cdac_data::m_NumComponents; - pInfo->arrayBaseSize = ARRAYBASE_BASESIZE; - pInfo->methodTableComponentSizeOffset = (uint32_t)cdac_data::MTFlags;// component size is low 16 bits of m_dwFlags - // TLS access info - how to reach t_runtime_thread_locals GetObjectAllocContextTlsInfo(pInfo); } diff --git a/src/coreclr/vm/threadstatics.cpp b/src/coreclr/vm/threadstatics.cpp index b6e13608740f5b..95ea89dc476cc5 100644 --- a/src/coreclr/vm/threadstatics.cpp +++ b/src/coreclr/vm/threadstatics.cpp @@ -1210,8 +1210,8 @@ void GetObjectAllocContextTlsInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) pInfo->tlsRoot.addr = (void*)(uintptr_t)ThreadLocalOffset(&t_runtime_thread_locals); 
pInfo->tlsRoot.accessType = IAT_VALUE; -#elif defined(TARGET_APPLE) && defined(TARGET_AMD64) - // macOS TLVP model not yet implemented in JIT codegen +#elif defined(TARGET_APPLE) + // macOS (both x64 and ARM64) TLVP model not yet implemented in JIT codegen pInfo->supported = false; #elif defined(TARGET_AMD64) @@ -1224,6 +1224,10 @@ void GetObjectAllocContextTlsInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) pInfo->supported = false; } +#elif defined(TARGET_ARM64) + extern "C" size_t GetRuntimeThreadLocalsVariableOffset(); + pInfo->tlsRootOffset = GetRuntimeThreadLocalsVariableOffset(); + #else pInfo->supported = false; #endif // TARGET_WINDOWS From a1c59b93634b88bcbf0209e363fdb7d21186436c Mon Sep 17 00:00:00 2001 From: EgorBo Date: Thu, 12 Mar 2026 22:22:24 +0100 Subject: [PATCH 10/10] fix arm64 ci --- src/coreclr/vm/threadstatics.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/vm/threadstatics.cpp b/src/coreclr/vm/threadstatics.cpp index 95ea89dc476cc5..47911ee254a4f3 100644 --- a/src/coreclr/vm/threadstatics.cpp +++ b/src/coreclr/vm/threadstatics.cpp @@ -1146,6 +1146,7 @@ static void* GetTlsIndexObjectAddress() #elif !defined(TARGET_ANDROID) && defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) extern "C" size_t GetThreadStaticsVariableOffset(); +extern "C" size_t GetRuntimeThreadLocalsVariableOffset(); #endif // !TARGET_ANDROID && TARGET_ARM64 || TARGET_LOONGARCH64 || TARGET_RISCV64 #endif // TARGET_WINDOWS @@ -1225,7 +1226,6 @@ void GetObjectAllocContextTlsInfo(CORINFO_OBJECT_ALLOC_CONTEXT_INFO* pInfo) } #elif defined(TARGET_ARM64) - extern "C" size_t GetRuntimeThreadLocalsVariableOffset(); pInfo->tlsRootOffset = GetRuntimeThreadLocalsVariableOffset(); #else