Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,6 @@ class OpDispatchBuilder final : public IREmitter {
void FCOMIF64(OpcodeArgs, IR::OpSize width, bool Integer, FCOMIFlags whichflags, bool poptwice);
void FDIVF64(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpResult ResInST0);
void FILDF64(OpcodeArgs);
void FISTF64(OpcodeArgs, bool Truncate);
void FLDF64_Const(OpcodeArgs, uint64_t Num);
void FLDF64(OpcodeArgs, IR::OpSize Width);
void FMULF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpResult ResInST0);
Expand Down Expand Up @@ -2430,6 +2429,7 @@ class OpDispatchBuilder final : public IREmitter {
}
}


Ref _LoadMemAutoTSO(FEXCore::IR::RegisterClassType Class, IR::OpSize Size, AddressMode A, IR::OpSize Align = IR::OpSize::i8Bit) {
bool AtomicTSO = IsTSOEnabled(Class) && !A.NonTSO;
A = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, AtomicTSO, Class != GPRClass, Size);
Expand Down
35 changes: 6 additions & 29 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ void OpDispatchBuilder::FLD(OpcodeArgs, IR::OpSize Width) {
_PushStack(ConvertedData, Data, ReadWidth, true);
}

// Float LoaD operation with memory operand
// Float LoaD operation with stack operand
void OpDispatchBuilder::FLDFromStack(OpcodeArgs) {
_CopyPushStack(Op->OP & 7);
}
Expand Down Expand Up @@ -132,7 +132,7 @@ void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);

A = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, false, false, Width);
_StoreStackMem(SourceSize, Width, A.Base, A.Index, OpSize::iInvalid, A.IndexType, A.IndexScale, /*Float=*/true);
_StoreStackMem(SourceSize, Width, A.Base, A.Index, OpSize::iInvalid, A.IndexType, A.IndexScale);

if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
Expand All @@ -153,34 +153,11 @@ void OpDispatchBuilder::FSTToStack(OpcodeArgs) {
// Store integer to memory (possibly with truncation)
void OpDispatchBuilder::FIST(OpcodeArgs, bool Truncate) {
const auto Size = OpSizeFromSrc(Op);
Ref Data = _ReadStackValue(0);

// For 16-bit integers, we need to manually check for overflow
// since _F80CVTInt doesn't handle 16-bit overflow detection properly
if (Size == OpSize::i16Bit) {
// Extract the 80-bit float value to check for special cases
// Get the upper 64 bits which contain sign and exponent and then the exponent from upper.
Ref Upper = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Data, 1);
Ref Exponent = _And(OpSize::i64Bit, Upper, Constant(0x7fff));

// Check for NaN/Infinity: exponent = 0x7fff
SaveNZCV();
_TestNZ(OpSize::i64Bit, Exponent, Constant(0x7fff));
Ref IsSpecial = _NZCVSelect01({COND_EQ});

// For overflow detection, check if exponent indicates a value >= 2^15
// Biased exponent for 2^15 is 0x3fff + 15 = 0x400e
SubWithFlags(OpSize::i64Bit, Exponent, 0x400e);
Ref IsOverflow = _NZCVSelect01({COND_UGE});

// Set Invalid Operation flag if overflow or special value
Ref InvalidFlag = _Or(OpSize::i64Bit, IsSpecial, IsOverflow);
SetRFLAG<FEXCore::X86State::X87FLAG_IE_LOC>(InvalidFlag);
}

Data = _F80CVTInt(Size, Data, Truncate);
const auto SourceSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit;
AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);

StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Data, Size, OpSize::i8Bit);
A = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, false, false, Size);
_StoreStackMemInt(SourceSize, Size, A.Base, A.Index, OpSize::iInvalid, A.IndexType, A.IndexScale, Truncate);

if ((Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) != 0) {
_PopStackDestroy();
Expand Down
15 changes: 0 additions & 15 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,21 +103,6 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {
_PushStack(ConvertedData, Data, ReadWidth, false);
}

void OpDispatchBuilder::FISTF64(OpcodeArgs, bool Truncate) {
const auto Size = OpSizeFromSrc(Op);

Ref data = _ReadStackValue(0);
if (Truncate) {
data = _Float_ToGPR_ZS(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
} else {
data = _Float_ToGPR_S(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
}
StoreResult_WithOpSize(GPRClass, Op, Op->Dest, data, Size, OpSize::i8Bit);

if ((Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) != 0) {
_PopStackDestroy();
}
}

void OpDispatchBuilder::FADDF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::OpResult ResInST0) {
if (Op->Src[0].IsNone()) { // Implicit argument case
Expand Down
16 changes: 8 additions & 8 deletions FEXCore/Source/Interface/Core/X86Tables/X87Tables.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,11 @@ constexpr std::array<DispatchTableEntry, 133> X87F64OpTable = {{

{OPDReg(0xDB, 0) | 0x00, 8, &OpDispatchBuilder::FILDF64},

{OPDReg(0xDB, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>},
{OPDReg(0xDB, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>},

{OPDReg(0xDB, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
{OPDReg(0xDB, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},

{OPDReg(0xDB, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
{OPDReg(0xDB, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},

// 4 = Invalid

Expand Down Expand Up @@ -201,7 +201,7 @@ constexpr std::array<DispatchTableEntry, 133> X87F64OpTable = {{

{OPDReg(0xDD, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64, OpSize::i64Bit>},

{OPDReg(0xDD, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>},
{OPDReg(0xDD, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>},

{OPDReg(0xDD, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i64Bit>},

Expand Down Expand Up @@ -256,19 +256,19 @@ constexpr std::array<DispatchTableEntry, 133> X87F64OpTable = {{

{OPDReg(0xDF, 0) | 0x00, 8, &OpDispatchBuilder::FILDF64},

{OPDReg(0xDF, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>},
{OPDReg(0xDF, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>},

{OPDReg(0xDF, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
{OPDReg(0xDF, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},

{OPDReg(0xDF, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
{OPDReg(0xDF, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},

{OPDReg(0xDF, 4) | 0x00, 8, &OpDispatchBuilder::FBLDF64},

{OPDReg(0xDF, 5) | 0x00, 8, &OpDispatchBuilder::FILDF64},

{OPDReg(0xDF, 6) | 0x00, 8, &OpDispatchBuilder::FBSTPF64},

{OPDReg(0xDF, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
{OPDReg(0xDF, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},

// XXX: This should also set the x87 tag bits to empty
// We don't support this currently, so just pop the stack
Expand Down
18 changes: 13 additions & 5 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -2852,13 +2852,21 @@
"HasSideEffects": true,
"X87": true
},
"StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale, i1:$Float": {
"StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": {
"Desc": [
"Takes the top value off the x87 stack and stores it to memory.",
"Takes the top value off the x87 stack and stores it to memory as a Float.",
"SourceSize is 128bit for F80 values, 64-bit for low precision.",
"StoreSize is the store size for conversion:",
"Float: 80-bit, 64-bit, or 32-bit",
"Int: 64-bit, 32-bit, 16-bit"
"StoreSize is the store size for conversion: 80-bit, 64-bit, or 32-bit"
],
"HasSideEffects": true,
"X87": true
},
"StoreStackMemInt OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale, i1:$Truncate": {
"Desc": [
"Takes the top value off the x87 stack and stores it to memory as an Integer.",
"SourceSize is 128bit for F80 values, 64-bit for low precision.",
"StoreSize is the store size for conversion: 64-bit, 32-bit, 16-bit",
"Truncate: If true, we truncate the value instead of rounding it."
],
"HasSideEffects": true,
"X87": true
Expand Down
69 changes: 66 additions & 3 deletions FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "FEXCore/Utils/Profiler.h"
#include "FEXCore/Utils/MathUtils.h"
#include "FEXCore/Core/HostFeatures.h"
#include "FEXCore/Core/CoreState.h"
#include "Interface/Core/Addressing.h"

#include <array>
Expand Down Expand Up @@ -199,7 +200,7 @@ class X87StackOptimization final : public Pass {
MemOffsetType OffsetType = Op->OffsetType;
uint8_t OffsetScale = Op->OffsetScale;

// Normal Precision Mode
// Normal Precision Mode - Float stores
switch (Op->StoreSize) {
case OpSize::i32Bit:
case OpSize::i64Bit: {
Expand Down Expand Up @@ -235,6 +236,7 @@ class X87StackOptimization final : public Pass {
MemOffsetType OffsetType = Op->OffsetType;
uint8_t OffsetScale = Op->OffsetScale;

// Reduced Precision Mode - Float stores
switch (Op->StoreSize) {
case OpSize::i32Bit: {
StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
Expand Down Expand Up @@ -995,8 +997,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
// or similar. As long as the source size and dest size are one and the same.
// This will avoid any conversions between source and stack element size and conversion back.
if (!SlowPath && Value->Source && Value->Source->Size == Op->StoreSize && Value->InterpretAsFloat) {
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, Value->Source->Node, AddrNode, Offset, Align,
OffsetType, OffsetScale);
IREmit->_StoreMem(FPRClass, Op->StoreSize, Value->Source->Node, AddrNode, Offset, Align, OffsetType, OffsetScale);
break;
}

Expand All @@ -1009,6 +1010,68 @@ void X87StackOptimization::Run(IREmitter* Emit) {
break;
}

case OP_STORESTACKMEMINT: {
const auto* Op = IROp->C<IROp_StoreStackMemInt>();
const auto& Value = MigrateToSlowPath_IfInvalid();
Ref StackNode = SlowPath ? LoadStackValueAtOffset_Slow() : Value->StackDataNode;
Ref AddrNode = CurrentIR.GetNode(Op->Addr);
Ref Offset = CurrentIR.GetNode(Op->Offset);
OpSize Align = Op->Align;
MemOffsetType OffsetType = Op->OffsetType;
uint8_t OffsetScale = Op->OffsetScale;
bool Truncate = Op->Truncate;

// Similarly, optimize integer memcpy
if (!SlowPath && Value->Source && Value->Source->Size == Op->StoreSize && !Value->InterpretAsFloat) {
IREmit->_StoreMem(GPRClass, Op->StoreSize, Value->Source->Node, AddrNode, Offset, Align, OffsetType, OffsetScale);
break;
}

if (ReducedPrecisionMode) {
// Integer store in reduced precision mode - use Float_ToGPR conversions
const auto Size = Op->StoreSize;
Ref data = StackNode;
if (Truncate) {
data = IREmit->_Float_ToGPR_ZS(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
} else {
data = IREmit->_Float_ToGPR_S(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
}
IREmit->_StoreMem(GPRClass, Size, data, AddrNode, Offset, Align, OffsetType, OffsetScale);
} else {
// Normal precision mode - use F80CVTInt for conversion
const auto Size = Op->StoreSize;

// For 16-bit integers, we need to manually check for overflow
// since _F80CVTInt doesn't handle 16-bit overflow detection properly
if (Size == OpSize::i16Bit) {
// Extract the 80-bit float value to check for special cases
// Get the upper 64 bits which contain sign and exponent and then the exponent from upper.
Ref Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
Ref Exponent = IREmit->_And(OpSize::i64Bit, Upper, IREmit->_Constant(0x7fff));

// Check for NaN/Infinity: exponent = 0x7fff
// We need to manually track NZCV since we can't use SaveNZCV from optimization pass
IREmit->_TestNZ(OpSize::i64Bit, Exponent, IREmit->_Constant(0x7fff));
Ref IsSpecial = IREmit->_NZCVSelect01({COND_EQ});

// For overflow detection, check if exponent indicates a value >= 2^15
// Biased exponent for 2^15 is 0x3fff + 15 = 0x400e
IREmit->SubWithFlags(OpSize::i64Bit, Exponent, IREmit->_Constant(0x400e));
Ref IsOverflow = IREmit->_NZCVSelect01({COND_UGE});

// Set Invalid Operation flag if overflow or special value
Ref InvalidFlag = IREmit->_Or(OpSize::i64Bit, IsSpecial, IsOverflow);
// Store to the x87 flag context location
IREmit->_StoreContext(OpSize::i8Bit, GPRClass, InvalidFlag, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_IE_LOC);
}

Ref data = IREmit->_F80CVTInt(Size, StackNode, Truncate);
IREmit->_StoreMem(GPRClass, Size, data, AddrNode, Offset, Align, OffsetType, OffsetScale);
}

break;
}

case OP_STORESTACKTOSTACK: { // stores top of stack in another place in stack.
const auto* Op = IROp->C<IROp_StoreStackToStack>();
auto Offset = Op->StackLocation;
Expand Down
3 changes: 3 additions & 0 deletions unittests/ASM/Known_Failures
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
Test_X87/D9_F8.asm

# Block of size 1 doesn't work with this optimization
jit_1/Test_64Bit_X87_F64/Memcopy_int_F64.asm
39 changes: 39 additions & 0 deletions unittests/ASM/X87_F64/Memcopy_int_F64.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x1234567890ABCDEF",
"RBX": "0x1234567890ABCDEF",
"RCX": "0x9876543210FEDCBA",
"RSI": "0x9876543210FEDCBA"
},
"Env": { "FEX_X87REDUCEDPRECISION" : "1" }
}
%endif

; Test integer memcpy optimization in reduced precision mode
; Large 64-bit integers (> 2^53) should preserve precision when
; fild/fistp operations are optimized to direct memory copy

mov rdx, 0xe0000000

; Test case 1: Large positive integer > 2^53
; 0x1234567890ABCDEF = 1311768467463790319 > 2^53 = 9007199254740992
mov rax, 0x1234567890ABCDEF
mov [rdx + 0], rax

fild qword [rdx + 0]
fistp qword [rdx + 8]

mov rbx, [rdx + 8]

; Test case 2: Large negative integer
; 0x9876543210FEDCBA as signed = -7508735094825308742
mov rcx, 0x9876543210FEDCBA
mov [rdx + 16], rcx

fild qword [rdx + 16]
fistp qword [rdx + 24]

mov rsi, [rdx + 24]

hlt