diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
index 1d229e25bd..e60a5debae 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@@ -775,7 +775,6 @@ class OpDispatchBuilder final : public IREmitter {
   void FCOMIF64(OpcodeArgs, IR::OpSize width, bool Integer, FCOMIFlags whichflags, bool poptwice);
   void FDIVF64(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpResult ResInST0);
   void FILDF64(OpcodeArgs);
-  void FISTF64(OpcodeArgs, bool Truncate);
   void FLDF64_Const(OpcodeArgs, uint64_t Num);
   void FLDF64(OpcodeArgs, IR::OpSize Width);
   void FMULF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpResult ResInST0);
@@ -2430,6 +2429,7 @@ class OpDispatchBuilder final : public IREmitter {
     }
   }
 
+
   Ref _LoadMemAutoTSO(FEXCore::IR::RegisterClassType Class, IR::OpSize Size, AddressMode A, IR::OpSize Align = IR::OpSize::i8Bit) {
     bool AtomicTSO = IsTSOEnabled(Class) && !A.NonTSO;
     A = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, AtomicTSO, Class != GPRClass, Size);
diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
index 87814b7d2f..969de9acd6 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
@@ -72,7 +72,7 @@ void OpDispatchBuilder::FLD(OpcodeArgs, IR::OpSize Width) {
   _PushStack(ConvertedData, Data, ReadWidth, true);
 }
 
-// Float LoaD operation with memory operand
+// Float LoaD operation with stack operand
 void OpDispatchBuilder::FLDFromStack(OpcodeArgs) {
   _CopyPushStack(Op->OP & 7);
 }
@@ -132,7 +132,7 @@ void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
   AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);
   A = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, false, false, Width);
 
-  _StoreStackMem(SourceSize, Width, A.Base, A.Index, OpSize::iInvalid, A.IndexType, A.IndexScale, /*Float=*/true);
+  _StoreStackMem(SourceSize, Width, A.Base, A.Index, OpSize::iInvalid, A.IndexType, A.IndexScale);
 
   if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
     _PopStackDestroy();
@@ -153,34 +153,11 @@ void OpDispatchBuilder::FSTToStack(OpcodeArgs) {
 // Store integer to memory (possibly with truncation)
 void OpDispatchBuilder::FIST(OpcodeArgs, bool Truncate) {
   const auto Size = OpSizeFromSrc(Op);
-  Ref Data = _ReadStackValue(0);
-
-  // For 16-bit integers, we need to manually check for overflow
-  // since _F80CVTInt doesn't handle 16-bit overflow detection properly
-  if (Size == OpSize::i16Bit) {
-    // Extract the 80-bit float value to check for special cases
-    // Get the upper 64 bits which contain sign and exponent and then the exponent from upper.
-    Ref Upper = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Data, 1);
-    Ref Exponent = _And(OpSize::i64Bit, Upper, Constant(0x7fff));
-
-    // Check for NaN/Infinity: exponent = 0x7fff
-    SaveNZCV();
-    _TestNZ(OpSize::i64Bit, Exponent, Constant(0x7fff));
-    Ref IsSpecial = _NZCVSelect01({COND_EQ});
-
-    // For overflow detection, check if exponent indicates a value >= 2^15
-    // Biased exponent for 2^15 is 0x3fff + 15 = 0x400e
-    SubWithFlags(OpSize::i64Bit, Exponent, 0x400e);
-    Ref IsOverflow = _NZCVSelect01({COND_UGE});
-
-    // Set Invalid Operation flag if overflow or special value
-    Ref InvalidFlag = _Or(OpSize::i64Bit, IsSpecial, IsOverflow);
-    SetRFLAG<FEXCore::X86State::X87FLAG_IE_LOC>(InvalidFlag);
-  }
-
-  Data = _F80CVTInt(Size, Data, Truncate);
+  const auto SourceSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit;
+  AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);
 
-  StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Data, Size, OpSize::i8Bit);
+  A = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, false, false, Size);
+  _StoreStackMemInt(SourceSize, Size, A.Base, A.Index, OpSize::iInvalid, A.IndexType, A.IndexScale, Truncate);
 
   if ((Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) != 0) {
     _PopStackDestroy();
diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
index f3aa03e2d5..7882998f59 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
@@ -103,21 +103,6 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {
   _PushStack(ConvertedData, Data, ReadWidth, false);
 }
 
-void OpDispatchBuilder::FISTF64(OpcodeArgs, bool Truncate) {
-  const auto Size = OpSizeFromSrc(Op);
-
-  Ref data = _ReadStackValue(0);
-  if (Truncate) {
-    data = _Float_ToGPR_ZS(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
-  } else {
-    data = _Float_ToGPR_S(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
-  }
-  StoreResult_WithOpSize(GPRClass, Op, Op->Dest, data, Size, OpSize::i8Bit);
-
-  if ((Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) != 0) {
-    _PopStackDestroy();
-  }
-}
 void OpDispatchBuilder::FADDF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::OpResult ResInST0) {
   if (Op->Src[0].IsNone()) { // Implicit argument case
diff --git a/FEXCore/Source/Interface/Core/X86Tables/X87Tables.cpp b/FEXCore/Source/Interface/Core/X86Tables/X87Tables.cpp
index 1a6743b5f6..64a2a42735 100644
--- a/FEXCore/Source/Interface/Core/X86Tables/X87Tables.cpp
+++ b/FEXCore/Source/Interface/Core/X86Tables/X87Tables.cpp
@@ -140,11 +140,11 @@ constexpr std::array X87F64OpTable = {{
   {OPDReg(0xDB, 0) | 0x00, 8, &OpDispatchBuilder::FILDF64},
 
-  {OPDReg(0xDB, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>},
+  {OPDReg(0xDB, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>},
 
-  {OPDReg(0xDB, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
+  {OPDReg(0xDB, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},
 
-  {OPDReg(0xDB, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
+  {OPDReg(0xDB, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},
 
   // 4 = Invalid
 
@@ -201,7 +201,7 @@ constexpr std::array X87F64OpTable = {{
   {OPDReg(0xDD, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64, OpSize::i64Bit>},
 
-  {OPDReg(0xDD, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>},
+  {OPDReg(0xDD, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>},
 
   {OPDReg(0xDD, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i64Bit>},
 
@@ -256,11 +256,11 @@ constexpr std::array X87F64OpTable = {{
   {OPDReg(0xDF, 0) | 0x00, 8, &OpDispatchBuilder::FILDF64},
 
-  {OPDReg(0xDF, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>},
+  {OPDReg(0xDF, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>},
 
-  {OPDReg(0xDF, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
+  {OPDReg(0xDF, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},
 
-  {OPDReg(0xDF, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
+  {OPDReg(0xDF, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},
 
   {OPDReg(0xDF, 4) | 0x00, 8, &OpDispatchBuilder::FBLDF64},
 
@@ -268,7 +268,7 @@ constexpr std::array X87F64OpTable = {{
   {OPDReg(0xDF, 6) | 0x00, 8, &OpDispatchBuilder::FBSTPF64},
 
-  {OPDReg(0xDF, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
+  {OPDReg(0xDF, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},
 
   // XXX: This should also set the x87 tag bits to empty
   // We don't support this currently, so just pop the stack
diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json
index 2c6729d6f1..d6460a328a 100644
--- a/FEXCore/Source/Interface/IR/IR.json
+++ b/FEXCore/Source/Interface/IR/IR.json
@@ -2852,13 +2852,21 @@
       "HasSideEffects": true,
       "X87": true
     },
-    "StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale, i1:$Float": {
+    "StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": {
       "Desc": [
-        "Takes the top value off the x87 stack and stores it to memory.",
+        "Takes the top value off the x87 stack and stores it to memory as a Float.",
         "SourceSize is 128bit for F80 values, 64-bit for low precision.",
-        "StoreSize is the store size for conversion:",
-        "Float: 80-bit, 64-bit, or 32-bit",
-        "Int: 64-bit, 32-bit, 16-bit"
+        "StoreSize is the store size for conversion: 80-bit, 64-bit, or 32-bit"
+      ],
+      "HasSideEffects": true,
+      "X87": true
+    },
+    "StoreStackMemInt OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale, i1:$Truncate": {
+      "Desc": [
+        "Takes the top value off the x87 stack and stores it to memory as an Integer.",
+        "SourceSize is 128bit for F80 values, 64-bit for low precision.",
+        "StoreSize is the store size for conversion: 64-bit, 32-bit, 16-bit",
+        "Truncate: If true, we truncate the value instead of rounding it."
       ],
       "HasSideEffects": true,
       "X87": true
diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp
index 6ecd1c10e0..f84e77ddb1 100644
--- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp
+++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp
@@ -8,6 +8,7 @@
 #include "FEXCore/Utils/Profiler.h"
 #include "FEXCore/Utils/MathUtils.h"
 #include "FEXCore/Core/HostFeatures.h"
+#include "FEXCore/Core/CoreState.h"
 #include "Interface/Core/Addressing.h"
 #include 
 
@@ -199,7 +200,7 @@ class X87StackOptimization final : public Pass {
     MemOffsetType OffsetType = Op->OffsetType;
     uint8_t OffsetScale = Op->OffsetScale;
 
-    // Normal Precision Mode
+    // Normal Precision Mode - Float stores
     switch (Op->StoreSize) {
     case OpSize::i32Bit:
     case OpSize::i64Bit: {
@@ -235,6 +236,7 @@ class X87StackOptimization final : public Pass {
     MemOffsetType OffsetType = Op->OffsetType;
     uint8_t OffsetScale = Op->OffsetScale;
 
+    // Reduced Precision Mode - Float stores
    switch (Op->StoreSize) {
     case OpSize::i32Bit: {
       StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
@@ -995,8 +997,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
       // or similar. As long as the source size and dest size are one and the same.
       // This will avoid any conversions between source and stack element size and conversion back.
       if (!SlowPath && Value->Source && Value->Source->Size == Op->StoreSize && Value->InterpretAsFloat) {
-        IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, Value->Source->Node, AddrNode, Offset, Align,
-                          OffsetType, OffsetScale);
+        IREmit->_StoreMem(FPRClass, Op->StoreSize, Value->Source->Node, AddrNode, Offset, Align, OffsetType, OffsetScale);
         break;
       }
 
@@ -1009,6 +1010,68 @@ void X87StackOptimization::Run(IREmitter* Emit) {
       break;
     }
 
+    case OP_STORESTACKMEMINT: {
+      const auto* Op = IROp->C<IROp_StoreStackMemInt>();
+      const auto& Value = MigrateToSlowPath_IfInvalid();
+      Ref StackNode = SlowPath ? LoadStackValueAtOffset_Slow() : Value->StackDataNode;
+      Ref AddrNode = CurrentIR.GetNode(Op->Addr);
+      Ref Offset = CurrentIR.GetNode(Op->Offset);
+      OpSize Align = Op->Align;
+      MemOffsetType OffsetType = Op->OffsetType;
+      uint8_t OffsetScale = Op->OffsetScale;
+      bool Truncate = Op->Truncate;
+
+      // Similarly to the float case, optimize the integer memcpy pattern
+      if (!SlowPath && Value->Source && Value->Source->Size == Op->StoreSize && !Value->InterpretAsFloat) {
+        IREmit->_StoreMem(GPRClass, Op->StoreSize, Value->Source->Node, AddrNode, Offset, Align, OffsetType, OffsetScale);
+        break;
+      }
+
+      if (ReducedPrecisionMode) {
+        // Integer store in reduced precision mode - use Float_ToGPR conversions
+        const auto Size = Op->StoreSize;
+        Ref data = StackNode;
+        if (Truncate) {
+          data = IREmit->_Float_ToGPR_ZS(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
+        } else {
+          data = IREmit->_Float_ToGPR_S(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
+        }
+        IREmit->_StoreMem(GPRClass, Size, data, AddrNode, Offset, Align, OffsetType, OffsetScale);
+      } else {
+        // Normal precision mode - use F80CVTInt for conversion
+        const auto Size = Op->StoreSize;
+
+        // For 16-bit integers, we need to manually check for overflow
+        // since _F80CVTInt doesn't handle 16-bit overflow detection properly
+        if (Size == OpSize::i16Bit) {
+          // Extract the 80-bit float value to check for special cases
+          // Get the upper 64 bits which contain sign and exponent and then the exponent from upper.
+          Ref Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
+          Ref Exponent = IREmit->_And(OpSize::i64Bit, Upper, IREmit->_Constant(0x7fff));
+
+          // Check for NaN/Infinity: exponent = 0x7fff
+          // We need to manually track NZCV since we can't use SaveNZCV from the optimization pass
+          IREmit->_TestNZ(OpSize::i64Bit, Exponent, IREmit->_Constant(0x7fff));
+          Ref IsSpecial = IREmit->_NZCVSelect01({COND_EQ});
+
+          // For overflow detection, check if exponent indicates a value >= 2^15
+          // Biased exponent for 2^15 is 0x3fff + 15 = 0x400e
+          IREmit->SubWithFlags(OpSize::i64Bit, Exponent, IREmit->_Constant(0x400e));
+          Ref IsOverflow = IREmit->_NZCVSelect01({COND_UGE});
+
+          // Set Invalid Operation flag if overflow or special value
+          Ref InvalidFlag = IREmit->_Or(OpSize::i64Bit, IsSpecial, IsOverflow);
+          // Store to the x87 flag context location
+          IREmit->_StoreContext(OpSize::i8Bit, GPRClass, InvalidFlag, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_IE_LOC);
+        }
+
+        Ref data = IREmit->_F80CVTInt(Size, StackNode, Truncate);
+        IREmit->_StoreMem(GPRClass, Size, data, AddrNode, Offset, Align, OffsetType, OffsetScale);
+      }
+
+      break;
+    }
+
     case OP_STORESTACKTOSTACK: { // stores top of stack in another place in stack.
      const auto* Op = IROp->C<IROp_StoreStackToStack>();
      auto Offset = Op->StackLocation;
diff --git a/unittests/ASM/Known_Failures b/unittests/ASM/Known_Failures
index 6044e44d7f..c84b25394a 100644
--- a/unittests/ASM/Known_Failures
+++ b/unittests/ASM/Known_Failures
@@ -1 +1,4 @@
 Test_X87/D9_F8.asm
+
+# Block of size 1 doesn't work with this optimization
+jit_1/Test_64Bit_X87_F64/Memcopy_int_F64.asm
\ No newline at end of file
diff --git a/unittests/ASM/X87_F64/Memcopy_int_F64.asm b/unittests/ASM/X87_F64/Memcopy_int_F64.asm
new file mode 100644
index 0000000000..16ddc53add
--- /dev/null
+++ b/unittests/ASM/X87_F64/Memcopy_int_F64.asm
@@ -0,0 +1,39 @@
+%ifdef CONFIG
+{
+  "RegData": {
+    "RAX": "0x1234567890ABCDEF",
+    "RBX": "0x1234567890ABCDEF",
+    "RCX": "0x9876543210FEDCBA",
+    "RSI": "0x9876543210FEDCBA"
+  },
+  "Env": { "FEX_X87REDUCEDPRECISION" : "1" }
+}
+%endif
+
+; Test integer memcpy optimization in reduced precision mode
+; Large 64-bit integers (> 2^53) should preserve precision when
+; fild/fistp operations are optimized to a direct memory copy
+
+mov rdx, 0xe0000000
+
+; Test case 1: Large positive integer > 2^53
+; 0x1234567890ABCDEF = 1311768467294899695 > 2^53 = 9007199254740992
+mov rax, 0x1234567890ABCDEF
+mov [rdx + 0], rax
+
+fild qword [rdx + 0]
+fistp qword [rdx + 8]
+
+mov rbx, [rdx + 8]
+
+; Test case 2: Large negative integer
+; 0x9876543210FEDCBA as signed = -7460683158682411846
+mov rcx, 0x9876543210FEDCBA
+mov [rdx + 16], rcx
+
+fild qword [rdx + 16]
+fistp qword [rdx + 24]
+
+mov rsi, [rdx + 24]
+
+hlt
\ No newline at end of file
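
The 16-bit overflow check that moves into OP_STORESTACKMEMINT mirrors the architectural FIST/FISTP rule: with the invalid-operation exception masked, an out-of-range source stores the 16-bit integer indefinite (0x8000) and sets IE. A hypothetical companion test along the lines below (not part of this diff; the register expectation assumes FEX's conversion reproduces the architectural indefinite result, which would need to be confirmed before adding such a test) sketches how that path could be exercised in normal precision mode:

%ifdef CONFIG
{
  "RegData": {
    "RBX": "0x8000"
  }
}
%endif

; Hypothetical sketch for the 16-bit overflow path of fistp.
; 65536 does not fit in a signed 16-bit integer, so a masked
; invalid-operation architecturally stores the integer indefinite
; value 0x8000 (assumed here to match FEX's behavior).

mov rdx, 0xe0000000

mov eax, 65536
mov [rdx + 0], eax

fild dword [rdx + 0]
fistp word [rdx + 8]

movzx rbx, word [rdx + 8]

hlt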