Skip to content

Commit

Permalink
Make linux-riscv nativeaot port robust (#112736)
Browse files Browse the repository at this point in the history
* Fix AOT exception handling on linux-riscv64

* Apply more suggestions

Co-authored-by: Filip Navara <[email protected]>

* Fix unwind information for allocation helpers

* Restore correct FP registers in StackFrameIterator::UnwindFuncletInvokeThunk

* Fix register addressing in GcInfoDecoder::GetRegisterSlot

* Fix and enable software write watch

* Fix COOP frames layout

* Fix indirections, comparison and other assembly for write barrier checks

* Fix cross-build on Risc-V host

* Disable R2R in stage2 build

* Fix uninitialized value in RhpPInvoke

* Fix register in RhpPInvokeReturn

* Initialize FP/RA pointers when creating StackFrameIterator from native context

* Fix sign on PROLOG_SAVE_REG_PAIR_INDEXED

* Fix more of the logic in write barriers. Needs further audit.

* Fix incorrect unwinding information generated for methods with frame size > PAGE_SIZE

* Fix emitting jump to bad slot helper

* Fix unwind info for universal transitions

* Fix comment

* Fix flipped RA/FP in universal transition unwinding

* WIP: Rewrite thunk code generation

* WIP: Fix RhCommonStub

* Fix layout of universal translation for unwinding

* WIP: Add memory barriers to native AOT asm helpers (#106219)

* WIP: Add fences to RhpCheckedLockCmpXchg/RhpCheckedXchg and fix atomicity guarantees of RhpCheckedXchg

* Preserve registers in INLINE_GET_TLS_VAR

* Preserve a7 as well

* WIP: Attempt to fix masks in IsInProlog/TrailingEpilogueInstructionsCount (to be reviewed)

* WIP: Fix GC hijacking flags and possible return value trashing

* Fix return value trashing

* Actually fix the PTFF_THREAD_HIJACK_HI flag in correct file

* Apply suggestions from code review

Co-authored-by: Tomek Sowiński <[email protected]>

* Update src/coreclr/nativeaot/Runtime/ThunksMapping.cpp

Co-authored-by: Filip Navara <[email protected]>

* Make style consistent

* Flip the sign of PROLOG_SAVE_REG_PAIR_INDEXED to match CoreCLR definition and PROLOG_SAVE_REG_PAIR_NO_FP_INDEXED

* Fix the last change to sign

* Save one mv instruction

* Relax FENCE in R2R helpers

* Apply suggestions from code review

Co-authored-by: Tomek Sowiński <[email protected]>

* Update managed definition of TransitionBlock too

* Update TLSDESC comments

* Update src/coreclr/nativeaot/Runtime/riscv64/WriteBarriers.S

Co-authored-by: Tomasz Sowiński <[email protected]>

* Relax semantics of a fence in R2R helper to match ARM64

* Add missing fence in RhpAssignRefRiscV64 (matches ARM code and CoreCLR/Risc-V code)

* Match barriers emitted by PalInterlockedOperationBarrier in NativeAOT and equivalent code in CoreCLR in ExchangeObject/CompareExchangeObject

---------

Co-authored-by: Filip Navara <[email protected]>
Co-authored-by: Tomek Sowiński <[email protected]>
Co-authored-by: Jan Kotas <[email protected]>
  • Loading branch information
4 people authored Feb 27, 2025
1 parent e22d62a commit 2129f3f
Show file tree
Hide file tree
Showing 20 changed files with 214 additions and 156 deletions.
2 changes: 2 additions & 0 deletions eng/native/configureplatform.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ if(CLR_CMAKE_HOST_OS STREQUAL linux)
set(CLR_CMAKE_HOST_UNIX_X86 1)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL aarch64)
set(CLR_CMAKE_HOST_UNIX_ARM64 1)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL riscv64)
set(CLR_CMAKE_HOST_UNIX_RISCV64 1)
else()
clr_unknown_arch()
endif()
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/codegenriscv64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7223,7 +7223,7 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe

if (leftFrameSize != 0)
{
genStackPointerAdjustment(-leftFrameSize, REG_SCRATCH, nullptr, /* reportUnwindData */ true);
genStackPointerAdjustment(-leftFrameSize, REG_SCRATCH, nullptr, /* reportUnwindData */ false);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,6 @@ internal struct ReturnBlock
{
private IntPtr returnValue;
private IntPtr returnValue2;
private IntPtr returnValue3;
private IntPtr returnValue4;
}

[StructLayout(LayoutKind.Sequential)]
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/nativeaot/Runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ endif()

add_definitions(-DFEATURE_BASICFREEZE)
add_definitions(-DFEATURE_CONSERVATIVE_GC)
if(CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64)
if(CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64 OR CLR_CMAKE_TARGET_ARCH_RISCV64)
add_definitions(-DFEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP)
add_definitions(-DFEATURE_MANUALLY_MANAGED_CARD_BUNDLES)
endif()
Expand Down
21 changes: 12 additions & 9 deletions src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ void StackFrameIterator::InternalInit(Thread * pThreadToWalk, PTR_PAL_LIMITED_CO
// preserved floating-point registers
//
int32_t preservedFpIndices[] = {8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27};
for (int i = 0; i < sizeof(preservedFpIndices) / sizeof(preservedFpIndices[0]); i++)
for (int i = 0; i < ARRAY_SIZE(preservedFpIndices); i++)
{
m_RegDisplay.F[preservedFpIndices[i]] = pCtx->F[preservedFpIndices[i]];
}
Expand Down Expand Up @@ -809,6 +809,8 @@ void StackFrameIterator::InternalInit(Thread * pThreadToWalk, NATIVE_CONTEXT* pC
m_RegDisplay.pS9 = (PTR_uintptr_t)PTR_TO_REG(pCtx, S9);
m_RegDisplay.pS10 = (PTR_uintptr_t)PTR_TO_REG(pCtx, S10);
m_RegDisplay.pS11 = (PTR_uintptr_t)PTR_TO_REG(pCtx, S11);
m_RegDisplay.pFP = (PTR_uintptr_t)PTR_TO_REG(pCtx, Fp);
m_RegDisplay.pRA = (PTR_uintptr_t)PTR_TO_REG(pCtx, Ra);

//
// scratch regs
Expand Down Expand Up @@ -1285,9 +1287,10 @@ void StackFrameIterator::UnwindFuncletInvokeThunk()
#elif defined(TARGET_RISCV64)
PTR_uint64_t f = (PTR_uint64_t)(m_RegDisplay.SP);

for (int i = 0; i < 32; i++)
int32_t preservedFpIndices[] = {8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27};
for (int i = 0; i < ARRAY_SIZE(preservedFpIndices); i++)
{
m_RegDisplay.F[i] = *f++;
m_RegDisplay.F[preservedFpIndices[i]] = *f++;
}

SP = (PTR_uintptr_t)f;
Expand Down Expand Up @@ -1496,12 +1499,12 @@ struct UniversalTransitionStackFrame
// Conservative GC reporting must be applied to everything between the base of the
// ReturnBlock and the top of the StackPassedArgs.
private:
uintptr_t m_pushedRA; // ChildSP+000 CallerSP-0F0 (0x08 bytes) (ra)
uintptr_t m_pushedFP; // ChildSP+008 CallerSP-0E8 (0x08 bytes) (fp)
Fp128 m_fpArgRegs[8]; // ChildSP+010 CallerSP-0E0 (0x80 bytes) (fa0-fa7)
uintptr_t m_returnBlock[4]; // ChildSP+090 CallerSP-060 (0x20 bytes)
uintptr_t m_intArgRegs[8]; // ChildSP+0B0 CallerSP-040 (0x40 bytes) (a0-a7)
uintptr_t m_stackPassedArgs[1]; // ChildSP+0F0 CallerSP+000 (unknown size)
uintptr_t m_pushedFP; // ChildSP+000 CallerSP-0A0 (0x08 bytes) (fp)
uintptr_t m_pushedRA; // ChildSP+008 CallerSP-098 (0x08 bytes) (ra)
uint64_t m_fpArgRegs[8]; // ChildSP+010 CallerSP-090 (0x40 bytes) (fa0-fa7)
uintptr_t m_returnBlock[2]; // ChildSP+050 CallerSP-050 (0x10 bytes)
uintptr_t m_intArgRegs[8]; // ChildSP+060 CallerSP-040 (0x40 bytes) (a0-a7)
uintptr_t m_stackPassedArgs[1]; // ChildSP+0A0 CallerSP+000 (unknown size)

public:
PTR_uintptr_t get_CallerSP() { return GET_POINTER_TO_FIELD(m_stackPassedArgs[0]); }
Expand Down
26 changes: 16 additions & 10 deletions src/coreclr/nativeaot/Runtime/ThunksMapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#elif TARGET_LOONGARCH64
#define THUNK_SIZE 16
#elif TARGET_RISCV64
#define THUNK_SIZE 12
#define THUNK_SIZE 20
#else
#define THUNK_SIZE (2 * OS_PAGE_SIZE) // This will cause RhpGetNumThunksPerBlock to return 0
#endif
Expand Down Expand Up @@ -259,21 +259,27 @@ EXTERN_C void* QCALLTYPE RhAllocateThunksMapping()

#elif defined(TARGET_RISCV64)

// auipc t0, %hi(delta) // Load upper immediate with address high bits
// ld t1, %lo(delta)(t0) // Load data from address in (t0 + lower immediate)
// jr t1 // Jump and don't link register
//auipc t1, hi(<delta PC to thunk data address>)
//addi t1, t1, lo(<delta PC to thunk data address>)
//auipc t0, hi(<delta to get to last word in data page>)
//ld t0, (t0)
//jalr zero, t0, 0

int delta = (int)(pCurrentDataAddress - pCurrentThunkAddress);
uint32_t deltaHi = (delta + 0x800) & 0xfffff000;
uint32_t deltaLo = delta << (32 - 12);

*((uint32_t*)pCurrentThunkAddress) = 0x00000297 | deltaHi; // auipc
*((uint32_t*)pCurrentThunkAddress) = 0x00000317 | ((((delta + 0x800) & 0xFFFFF000) >> 12) << 12); // auipc t1, delta[31:12]
pCurrentThunkAddress += 4;

*((uint32_t*)pCurrentThunkAddress) = 0x00030313 | ((delta & 0xFFF) << 20); // addi t1, t1, delta[11:0]
pCurrentThunkAddress += 4;

delta += OS_PAGE_SIZE - POINTER_SIZE - (i * POINTER_SIZE * 2) - 8;
*((uint32_t*)pCurrentThunkAddress) = 0x00000297 | ((((delta + 0x800) & 0xFFFFF000) >> 12) << 12); // auipc t0, delta[31:12]
pCurrentThunkAddress += 4;

*((uint32_t*)pCurrentThunkAddress) = 0x0002B303 | deltaLo; // addi
*((uint32_t*)pCurrentThunkAddress) = 0x0002b283 | ((delta & 0xFFF) << 20); // ld t0, (delta[11:0])(t0)
pCurrentThunkAddress += 4;

*((uint32_t*)pCurrentThunkAddress) = 0x00030067; // jr
*((uint32_t*)pCurrentThunkAddress) = 0x00008282; // jalr zero, t0, 0
pCurrentThunkAddress += 4;

#else
Expand Down
10 changes: 5 additions & 5 deletions src/coreclr/nativeaot/Runtime/riscv64/AllocFast.S
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ LOCAL_LABEL(RhpNewFast_RarePath):

// a3: transition frame

// Preserve the MethodTable in s0
mv s0, a0
// Preserve the MethodTable in s2
mv s2, a0

li a2, 0 // numElements

Expand All @@ -96,7 +96,7 @@ LOCAL_LABEL(NewOutOfMemory):
// This is the OOM failure path. We are going to tail-call to a managed helper that will throw
// an out of memory exception that the caller of this allocator understands.

mv a0, s0 // MethodTable pointer
mv a0, s2 // MethodTable pointer
li a1, 0 // Indicate that we should throw OOM.

POP_COOP_PINVOKE_FRAME
Expand Down Expand Up @@ -243,7 +243,7 @@ LOCAL_LABEL(RhpNewArray_Rare):
PUSH_COOP_PINVOKE_FRAME a3

// Preserve data we will need later into the callee saved registers
mv s0, a0 // Preserve MethodTable
mv s2, a0 // Preserve MethodTable

mv a2, a1 // numElements
li a1, 0 // uFlags
Expand All @@ -264,7 +264,7 @@ LOCAL_LABEL(ArrayOutOfMemory):
// This is the OOM failure path. We are going to tail-call to a managed helper that will throw
// an out of memory exception that the caller of this allocator understands.

mv a0, s0 // MethodTable Pointer
mv a0, s2 // MethodTable Pointer
li a1, 0 // Indicate that we should throw OOM.

POP_COOP_PINVOKE_FRAME
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/nativeaot/Runtime/riscv64/ExceptionHandling.S
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <unixasmmacros.inc>
#include "AsmOffsets.inc"

#define STACKSIZEOF_ExInfo ((SIZEOF__ExInfo + 15)&(~15))
#define STACKSIZEOF_ExInfo ((SIZEOF__ExInfo + 7) & ~7)

#define HARDWARE_EXCEPTION 1
#define SOFTWARE_EXCEPTION 0
Expand Down
14 changes: 7 additions & 7 deletions src/coreclr/nativeaot/Runtime/riscv64/GcProbe.S
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,13 @@

# Perform the rest of the PInvokeTransitionFrame initialization.
sd \threadReg, OFFSETOF__PInvokeTransitionFrame__m_pThread(sp) # Thread * (unused by stackwalker)
sd \BITMASK, (OFFSETOF__PInvokeTransitionFrame__m_pThread + 8)(sp) # Save the register bitmask passed in by caller
sd \BITMASK, OFFSETOF__PInvokeTransitionFrame__m_Flags(sp) # Save the register bitmask passed in by caller

addi \trashReg, sp, PROBE_FRAME_SIZE # Recover value of caller's SP
sd \trashReg, 0x78(sp) # Save caller's SP

# Link the frame into the Thread
mv \trashReg, sp
sd \trashReg, OFFSETOF__Thread__m_pDeferredTransitionFrame(\threadReg)
sd sp, OFFSETOF__Thread__m_pDeferredTransitionFrame(\threadReg)

.endm

Expand Down Expand Up @@ -84,7 +83,9 @@
.macro FixupHijackedCallstack

// a2 <- GetThread()
mv t1, a0
INLINE_GETTHREAD a2
mv a0, t1

// Fix the stack by restoring the original return address
ld ra, OFFSETOF__Thread__m_pvHijackedReturnAddress(a2)
Expand All @@ -100,14 +101,13 @@
NESTED_ENTRY RhpGcProbeHijack, _TEXT, NoHandler
FixupHijackedCallstack

PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, a3
andi t3, a3, 1 << TrapThreadsFlags_TrapThreads_Bit
PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, t3
andi t3, t3, 1 << TrapThreadsFlags_TrapThreads_Bit
bnez t3, LOCAL_LABEL(WaitForGC)
jr ra

LOCAL_LABEL(WaitForGC):
li t6, (DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_A0 + PTFF_SAVE_A1 + PTFF_THREAD_HIJACK_HI)
or t3, t3, t6
li t3, (DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_A0 + PTFF_SAVE_A1 + (PTFF_THREAD_HIJACK_HI << 32))
tail C_FUNC(RhpWaitForGC)
NESTED_END RhpGcProbeHijack

Expand Down
16 changes: 9 additions & 7 deletions src/coreclr/nativeaot/Runtime/riscv64/InteropThunksHelpers.S
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,29 @@
//
// RhCommonStub
//
// INPUT: tp: thunk's data block
// INPUT: t1: thunk's data block
//
// TRASHES: t0, t1, tp
// TRASHES: t0, t1, t2
//
LEAF_ENTRY RhCommonStub, _TEXT
// There are arbitrary callers passing arguments with arbitrary signatures.
// Custom calling convention:
// t1 pointer to the current thunk's data block (data contains 2 pointer values: context + target pointers)

mv t2, a0
INLINE_GET_TLS_VAR t0, C_FUNC(tls_thunkData)
mv a0, t2

// t0 = base address of TLS data
// tp = address of context cell in thunk's data
// t1 = address of context cell in thunk's data

// Load the thunk address from the data block and store it in the thread's static storage
ld t1, 0(t0) // Load thunk address into t1 from the TLS base address
sd t1, 0(t0) // Store the thunk address in thread static storage
ld t2, 0(t1) // Load thunk data into t2
sd t2, 0(t0) // Store the thunk address in thread static storage

// Load the target address from the data block and jump to it
ld t1, POINTER_SIZE(t0) // Load target address into t1 from the data block
jalr t1 // Jump to the target address in t1
ld t1, POINTER_SIZE(t1) // Load target address into t1 from the data block
jr t1 // Jump to the target address in t1

LEAF_END RhCommonStub, _TEXT

Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/nativeaot/Runtime/riscv64/PInvoke.S
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
NESTED_ENTRY RhpPInvoke, _TEXT, NoHandler
sd fp, OFFSETOF__PInvokeTransitionFrame__m_FramePointer(a0)
sd ra, OFFSETOF__PInvokeTransitionFrame__m_RIP(a0)
sd t0, OFFSETOF__PInvokeTransitionFrame__m_PreservedRegs(a0)
sd sp, OFFSETOF__PInvokeTransitionFrame__m_PreservedRegs(a0)
li t0, PTFF_SAVE_SP
sd t0, OFFSETOF__PInvokeTransitionFrame__m_Flags(a0)

Expand All @@ -40,7 +40,7 @@ LEAF_ENTRY RhpPInvokeReturn, _TEXT

PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, t0

bnez t1, 0f // If TrapThreadsFlags_None is non-zero, branch
bnez t0, 0f // If TrapThreadsFlags_None is non-zero, branch
ret

0:
Expand Down
17 changes: 5 additions & 12 deletions src/coreclr/nativeaot/Runtime/riscv64/UniversalTransition.S
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,12 @@
.global RhpFpTrashValues
#endif // TRASH_SAVED_ARGUMENT_REGISTERS

// Padding to account for the odd number of saved integer registers
#define ALIGNMENT_PADDING_SIZE (8)

#define COUNT_ARG_REGISTERS (8)
#define INTEGER_REGISTER_SIZE (8)
#define ARGUMENT_REGISTERS_SIZE (COUNT_ARG_REGISTERS * INTEGER_REGISTER_SIZE)

// Largest return block is two 8-byte values (0x10 bytes)
#define RETURN_BLOCK_SIZE (32)
#define RETURN_BLOCK_SIZE 16

#define COUNT_FLOAT_ARG_REGISTERS (8)
#define FLOAT_REGISTER_SIZE (8)
Expand All @@ -31,7 +28,6 @@

// From CallerSP to ChildSP, the stack frame is composed of the following adjacent regions:
//
// ALIGNMENT_PADDING_SIZE
// ARGUMENT_REGISTERS_SIZE
// RETURN_BLOCK_SIZE
// FLOAT_ARG_REGISTERS_SIZE
Expand All @@ -41,7 +37,7 @@

#define DISTANCE_FROM_CHILDSP_TO_RETURN_BLOCK (PUSHED_FP_SIZE + PUSHED_RA_SIZE + FLOAT_ARG_REGISTERS_SIZE)

#define STACK_SIZE (ALIGNMENT_PADDING_SIZE + ARGUMENT_REGISTERS_SIZE + RETURN_BLOCK_SIZE + FLOAT_ARG_REGISTERS_SIZE + PUSHED_RA_SIZE + PUSHED_FP_SIZE)
#define STACK_SIZE (ARGUMENT_REGISTERS_SIZE + RETURN_BLOCK_SIZE + FLOAT_ARG_REGISTERS_SIZE + PUSHED_RA_SIZE + PUSHED_FP_SIZE)

#define FLOAT_ARG_OFFSET (PUSHED_FP_SIZE + PUSHED_RA_SIZE)
#define ARGUMENT_REGISTERS_OFFSET (FLOAT_ARG_OFFSET + FLOAT_ARG_REGISTERS_SIZE + RETURN_BLOCK_SIZE)
Expand All @@ -63,9 +59,8 @@
// Frame layout is:
//
// {StackPassedArgs} ChildSP+0A0 CallerSP+000
// {AlignmentPad (0x8 bytes)} ChildSP+0F8 CallerSP-008
// {IntArgRegs (a0-a7) (0x40 bytes)} ChildSP+060 CallerSP-040
// {ReturnBlock (0x20 bytes)} ChildSP+098 CallerSP-068
// {ReturnBlock (0x10 bytes)} ChildSP+050 CallerSP-050
// -- The base address of the Return block is the TransitionBlock pointer, the floating point args are
// in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact
// layout of all pieces of the frame that lie at or above the pushed floating point registers.
Expand All @@ -91,9 +86,7 @@
NESTED_ENTRY Rhp\FunctionName, _TEXT, NoHandler

# FP and RA registers
addi sp, sp, -STACK_SIZE
sd s0, 0x0(sp) # Save frame pointer
sd ra, 0x08(sp) # Save return address
PROLOG_SAVE_REG_PAIR_INDEXED fp, ra, STACK_SIZE

# Floating point registers
fsd fa0, FLOAT_ARG_OFFSET(sp)
Expand All @@ -105,7 +98,7 @@
fsd fa6, FLOAT_ARG_OFFSET + 0x30(sp)
fsd fa7, FLOAT_ARG_OFFSET + 0x38(sp)

# Space for return buffer data (0x40 bytes)
# Space for return block data (0x10 bytes)

# Save argument registers
sd a0, ARGUMENT_REGISTERS_OFFSET(sp)
Expand Down
Loading

0 comments on commit 2129f3f

Please sign in to comment.