FEX-Emu · pmatos · Sep 17, 2025 · Sep 17, 2025
diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@@ -775,7 +775,6 @@ class OpDispatchBuilder final : public IREmitter {
   void FCOMIF64(OpcodeArgs, IR::OpSize width, bool Integer, FCOMIFlags whichflags, bool poptwice);
   void FDIVF64(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpResult ResInST0);
   void FILDF64(OpcodeArgs);
-  void FISTF64(OpcodeArgs, bool Truncate);
   void FLDF64_Const(OpcodeArgs, uint64_t Num);
   void FLDF64(OpcodeArgs, IR::OpSize Width);
   void FMULF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpResult ResInST0);
@@ -2430,6 +2429,7 @@ class OpDispatchBuilder final : public IREmitter {
     }
   }
 
+
   Ref _LoadMemAutoTSO(FEXCore::IR::RegisterClassType Class, IR::OpSize Size, AddressMode A, IR::OpSize Align = IR::OpSize::i8Bit) {
     bool AtomicTSO = IsTSOEnabled(Class) && !A.NonTSO;
     A = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, AtomicTSO, Class != GPRClass, Size);

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
@@ -72,7 +72,7 @@ void OpDispatchBuilder::FLD(OpcodeArgs, IR::OpSize Width) {
   _PushStack(ConvertedData, Data, ReadWidth, true);
 }
 
-// Float LoaD operation with memory operand
+// Float LoaD operation with stack operand
 void OpDispatchBuilder::FLDFromStack(OpcodeArgs) {
   _CopyPushStack(Op->OP & 7);
 }
@@ -132,7 +132,7 @@ void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
   AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);
 
   A = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, false, false, Width);
-  _StoreStackMem(SourceSize, Width, A.Base, A.Index, OpSize::iInvalid, A.IndexType, A.IndexScale, /*Float=*/true);
+  _StoreStackMem(SourceSize, Width, A.Base, A.Index, OpSize::iInvalid, A.IndexType, A.IndexScale);
 
   if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
     _PopStackDestroy();
@@ -153,34 +153,11 @@ void OpDispatchBuilder::FSTToStack(OpcodeArgs) {
 // Store integer to memory (possibly with truncation)
 void OpDispatchBuilder::FIST(OpcodeArgs, bool Truncate) {
   const auto Size = OpSizeFromSrc(Op);
-  Ref Data = _ReadStackValue(0);
-
-  // For 16-bit integers, we need to manually check for overflow
-  // since _F80CVTInt doesn't handle 16-bit overflow detection properly
-  if (Size == OpSize::i16Bit) {
-    // Extract the 80-bit float value to check for special cases
-    // Get the upper 64 bits which contain sign and exponent and then the exponent from upper.
-    Ref Upper = _VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, Data, 1);
-    Ref Exponent = _And(OpSize::i64Bit, Upper, Constant(0x7fff));
-
-    // Check for NaN/Infinity: exponent = 0x7fff
-    SaveNZCV();
-    _TestNZ(OpSize::i64Bit, Exponent, Constant(0x7fff));
-    Ref IsSpecial = _NZCVSelect01({COND_EQ});
-
-    // For overflow detection, check if exponent indicates a value >= 2^15
-    // Biased exponent for 2^15 is 0x3fff + 15 = 0x400e
-    SubWithFlags(OpSize::i64Bit, Exponent, 0x400e);
-    Ref IsOverflow = _NZCVSelect01({COND_UGE});
-
-    // Set Invalid Operation flag if overflow or special value
-    Ref InvalidFlag = _Or(OpSize::i64Bit, IsSpecial, IsOverflow);
-    SetRFLAG<FEXCore::X86State::X87FLAG_IE_LOC>(InvalidFlag);
-  }
-
-  Data = _F80CVTInt(Size, Data, Truncate);
+  const auto SourceSize = ReducedPrecisionMode ? OpSize::i64Bit : OpSize::i128Bit;
+  AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);
 
-  StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Data, Size, OpSize::i8Bit);
+  A = SelectAddressMode(this, A, GetGPROpSize(), CTX->HostFeatures.SupportsTSOImm9, false, false, Size);
+  _StoreStackMemInt(SourceSize, Size, A.Base, A.Index, OpSize::iInvalid, A.IndexType, A.IndexScale, Truncate);
 
   if ((Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) != 0) {
     _PopStackDestroy();

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
@@ -103,21 +103,6 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {
   _PushStack(ConvertedData, Data, ReadWidth, false);
 }
 
-void OpDispatchBuilder::FISTF64(OpcodeArgs, bool Truncate) {
-  const auto Size = OpSizeFromSrc(Op);
-
-  Ref data = _ReadStackValue(0);
-  if (Truncate) {
-    data = _Float_ToGPR_ZS(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
-  } else {
-    data = _Float_ToGPR_S(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
-  }
-  StoreResult_WithOpSize(GPRClass, Op, Op->Dest, data, Size, OpSize::i8Bit);
-
-  if ((Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) != 0) {
-    _PopStackDestroy();
-  }
-}
 
 void OpDispatchBuilder::FADDF64(OpcodeArgs, IR::OpSize Width, bool Integer, OpDispatchBuilder::OpResult ResInST0) {
   if (Op->Src[0].IsNone()) { // Implicit argument case

diff --git a/FEXCore/Source/Interface/Core/X86Tables/X87Tables.cpp b/FEXCore/Source/Interface/Core/X86Tables/X87Tables.cpp
@@ -140,11 +140,11 @@ constexpr std::array<DispatchTableEntry, 133> X87F64OpTable = {{
 
   {OPDReg(0xDB, 0) | 0x00, 8, &OpDispatchBuilder::FILDF64},
 
-  {OPDReg(0xDB, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>},
+  {OPDReg(0xDB, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>},
 
-  {OPDReg(0xDB, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
+  {OPDReg(0xDB, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},
 
-  {OPDReg(0xDB, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
+  {OPDReg(0xDB, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},
 
   // 4 = Invalid
 
@@ -201,7 +201,7 @@ constexpr std::array<DispatchTableEntry, 133> X87F64OpTable = {{
 
   {OPDReg(0xDD, 0) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FLDF64, OpSize::i64Bit>},
 
-  {OPDReg(0xDD, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>},
+  {OPDReg(0xDD, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>},
 
   {OPDReg(0xDD, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FST, OpSize::i64Bit>},
 
@@ -256,19 +256,19 @@ constexpr std::array<DispatchTableEntry, 133> X87F64OpTable = {{
 
   {OPDReg(0xDF, 0) | 0x00, 8, &OpDispatchBuilder::FILDF64},
 
-  {OPDReg(0xDF, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, true>},
+  {OPDReg(0xDF, 1) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, true>},
 
-  {OPDReg(0xDF, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
+  {OPDReg(0xDF, 2) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},
 
-  {OPDReg(0xDF, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
+  {OPDReg(0xDF, 3) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},
 
   {OPDReg(0xDF, 4) | 0x00, 8, &OpDispatchBuilder::FBLDF64},
 
   {OPDReg(0xDF, 5) | 0x00, 8, &OpDispatchBuilder::FILDF64},
 
   {OPDReg(0xDF, 6) | 0x00, 8, &OpDispatchBuilder::FBSTPF64},
 
-  {OPDReg(0xDF, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FISTF64, false>},
+  {OPDReg(0xDF, 7) | 0x00, 8, &OpDispatchBuilder::Bind<&OpDispatchBuilder::FIST, false>},
 
   // XXX: This should also set the x87 tag bits to empty
   // We don't support this currently, so just pop the stack

diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json
@@ -2852,13 +2852,21 @@
         "HasSideEffects": true,
         "X87": true
       },
-      "StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale, i1:$Float": {
+      "StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale": {
         "Desc": [
-          "Takes the top value off the x87 stack and stores it to memory.",
+          "Takes the top value off the x87 stack and stores it to memory as a Float.",
           "SourceSize is 128bit for F80 values, 64-bit for low precision.",
-          "StoreSize is the store size for conversion:",
-          "Float: 80-bit, 64-bit, or 32-bit",
-          "Int: 64-bit, 32-bit, 16-bit"
+          "StoreSize is the store size for conversion: 80-bit, 64-bit, or 32-bit"
+        ],
+        "HasSideEffects": true,
+        "X87": true
+      },
+      "StoreStackMemInt OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, OpSize:$Align, MemOffsetType:$OffsetType, u8:$OffsetScale, i1:$Truncate": {
+        "Desc": [
+          "Takes the top value off the x87 stack and stores it to memory as an Integer.",
+          "SourceSize is 128bit for F80 values, 64-bit for low precision.",
+          "StoreSize is the store size for conversion: 64-bit, 32-bit, or 16-bit",
+          "Truncate: If true, we truncate the value instead of rounding it."
         ],
         "HasSideEffects": true,
         "X87": true

diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp
@@ -8,6 +8,7 @@
 #include "FEXCore/Utils/Profiler.h"
 #include "FEXCore/Utils/MathUtils.h"
 #include "FEXCore/Core/HostFeatures.h"
+#include "FEXCore/Core/CoreState.h"
 #include "Interface/Core/Addressing.h"
 
 #include <array>
@@ -199,7 +200,7 @@ class X87StackOptimization final : public Pass {
     MemOffsetType OffsetType = Op->OffsetType;
     uint8_t OffsetScale = Op->OffsetScale;
 
-    // Normal Precision Mode
+    // Normal Precision Mode - Float stores
     switch (Op->StoreSize) {
     case OpSize::i32Bit:
     case OpSize::i64Bit: {
@@ -235,6 +236,7 @@ class X87StackOptimization final : public Pass {
     MemOffsetType OffsetType = Op->OffsetType;
     uint8_t OffsetScale = Op->OffsetScale;
 
+    // Reduced Precision Mode - Float stores
     switch (Op->StoreSize) {
     case OpSize::i32Bit: {
       StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
@@ -995,8 +997,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
         // or similar. As long as the source size and dest size are one and the same.
         // This will avoid any conversions between source and stack element size and conversion back.
         if (!SlowPath && Value->Source && Value->Source->Size == Op->StoreSize && Value->InterpretAsFloat) {
-          IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, Value->Source->Node, AddrNode, Offset, Align,
-                            OffsetType, OffsetScale);
+          IREmit->_StoreMem(FPRClass, Op->StoreSize, Value->Source->Node, AddrNode, Offset, Align, OffsetType, OffsetScale);
           break;
         }
 
@@ -1009,6 +1010,68 @@ void X87StackOptimization::Run(IREmitter* Emit) {
         break;
       }
 
+      case OP_STORESTACKMEMINT: { // converts the top of stack to an integer and stores it to memory.
+        const auto* Op = IROp->C<IROp_StoreStackMemInt>();
+        const auto& Value = MigrateToSlowPath_IfInvalid();
+        Ref StackNode = SlowPath ? LoadStackValueAtOffset_Slow() : Value->StackDataNode;
+        Ref AddrNode = CurrentIR.GetNode(Op->Addr);
+        Ref Offset = CurrentIR.GetNode(Op->Offset);
+        OpSize Align = Op->Align;
+        MemOffsetType OffsetType = Op->OffsetType;
+        uint8_t OffsetScale = Op->OffsetScale;
+        bool Truncate = Op->Truncate;
+
+        // Integer memcpy fast path: the value still has an integer source of the store size (e.g. fild then fistp), so store it as-is.
+        if (!SlowPath && Value->Source && Value->Source->Size == Op->StoreSize && !Value->InterpretAsFloat) {
+          IREmit->_StoreMem(GPRClass, Op->StoreSize, Value->Source->Node, AddrNode, Offset, Align, OffsetType, OffsetScale);
+          break;
+        }
+
+        if (ReducedPrecisionMode) {
+          // Reduced precision: the stack value is a 64-bit float. Convert to a 32-bit or 64-bit GPR; the store narrows 16-bit results.
+          const auto Size = Op->StoreSize;
+          Ref data = StackNode;
+          if (Truncate) {
+            data = IREmit->_Float_ToGPR_ZS(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
+          } else {
+            data = IREmit->_Float_ToGPR_S(Size == OpSize::i32Bit ? OpSize::i32Bit : OpSize::i64Bit, OpSize::i64Bit, data);
+          }
+          IREmit->_StoreMem(GPRClass, Size, data, AddrNode, Offset, Align, OffsetType, OffsetScale);
+        } else {
+          // Normal precision mode: the stack value is an 80-bit float held in a 128-bit register; convert with F80CVTInt.
+          const auto Size = Op->StoreSize;
+
+          // For 16-bit integers, we need to manually check for overflow
+          // since _F80CVTInt doesn't handle 16-bit overflow detection properly
+          if (Size == OpSize::i16Bit) {
+            // Pull the upper 64-bit half of the 128-bit stack value into a GPR; its low 16 bits hold the F80 sign and exponent.
+            // Masking with 0x7fff drops the sign bit and keeps only the 15-bit biased exponent.
+            Ref Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
+            Ref Exponent = IREmit->_And(OpSize::i64Bit, Upper, IREmit->_Constant(0x7fff));
+
+            // Intended NaN/Infinity check (exponent == 0x7fff). NOTE(review): if TestNZ ANDs its operands, COND_EQ fires on exponent == 0 - confirm.
+            // NOTE(review): NZCV is overwritten here without the SaveNZCV() the dispatcher version used - confirm no guest flags are live.
+            IREmit->_TestNZ(OpSize::i64Bit, Exponent, IREmit->_Constant(0x7fff));
+            Ref IsSpecial = IREmit->_NZCVSelect01({COND_EQ});
+
+            // Overflow detection: an unsigned exponent >= 0x400e (0x3fff bias + 15) means |value| >= 2^15; this also covers 0x7fff.
+            // NOTE(review): -32768 has exponent 0x400e too but fits in a signed 16-bit integer - confirm flagging it is intended.
+            IREmit->SubWithFlags(OpSize::i64Bit, Exponent, IREmit->_Constant(0x400e));
+            Ref IsOverflow = IREmit->_NZCVSelect01({COND_UGE});
+
+            // Invalid Operation is raised when either check above fired
+            Ref InvalidFlag = IREmit->_Or(OpSize::i64Bit, IsSpecial, IsOverflow);
+            // Written unconditionally as one byte into the IE slot of CPUState::flags, so a clean conversion stores 0 there.
+            IREmit->_StoreContext(OpSize::i8Bit, GPRClass, InvalidFlag, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_IE_LOC);
+          }
+
+          Ref data = IREmit->_F80CVTInt(Size, StackNode, Truncate);
+          IREmit->_StoreMem(GPRClass, Size, data, AddrNode, Offset, Align, OffsetType, OffsetScale);
+        }
+
+        break;
+      }
+
       case OP_STORESTACKTOSTACK: { // stores top of stack in another place in stack.
         const auto* Op = IROp->C<IROp_StoreStackToStack>();
         auto Offset = Op->StackLocation;

diff --git a/unittests/ASM/Known_Failures b/unittests/ASM/Known_Failures
@@ -1 +1,4 @@
 Test_X87/D9_F8.asm
+
+# With a block size of 1 (jit_1), fild and fistp presumably land in separate blocks, so the integer memcpy optimization cannot apply
+jit_1/Test_64Bit_X87_F64/Memcopy_int_F64.asm
diff --git a/unittests/ASM/X87_F64/Memcopy_int_F64.asm b/unittests/ASM/X87_F64/Memcopy_int_F64.asm
@@ -0,0 +1,39 @@
+%ifdef CONFIG
+{
+  "RegData": {
+    "RAX": "0x1234567890ABCDEF",
+    "RBX": "0x1234567890ABCDEF",
+    "RCX": "0x9876543210FEDCBA",
+    "RSI": "0x9876543210FEDCBA"
+  },
+  "Env": { "FEX_X87REDUCEDPRECISION" : "1" }
+}
+%endif
+
+; Test the integer memcpy optimization in reduced precision mode.
+; Integers with magnitude > 2^53 are not exactly representable as a 64-bit double, so they
+; only round-trip bit-for-bit when fild/fistp is optimized into a direct memory copy.
+
+mov rdx, 0xe0000000
+
+; Test case 1: Large positive integer > 2^53
+; 0x1234567890ABCDEF = 1311768467294899695 > 2^53 = 9007199254740992
+mov rax, 0x1234567890ABCDEF
+mov [rdx + 0], rax
+
+fild qword [rdx + 0] ; push the 64-bit integer onto the x87 stack
+fistp qword [rdx + 8] ; store it back as a 64-bit integer and pop
+
+mov rbx, [rdx + 8] ; RegData expects RBX == RAX
+
+; Test case 2: Large negative integer with magnitude > 2^53
+; 0x9876543210FEDCBA as signed = -7460683158682411846
+mov rcx, 0x9876543210FEDCBA
+mov [rdx + 16], rcx
+
+fild qword [rdx + 16] ; push the 64-bit integer onto the x87 stack
+fistp qword [rdx + 24] ; store it back as a 64-bit integer and pop
+
+mov rsi, [rdx + 24] ; RegData expects RSI == RCX
+
+hlt