Commit 2a95651

[CIR] [Lowering] [X86_64] Support VAArg for LongDouble (#1150)
Recommit of #1101. I am not sure what happened, but that merged PR does not show up in the git log; perhaps the stacked PR did not land successfully. Either way, we need to land it again. The original commit message follows: --- This is a follow-up to #1100. After #1100, using LongDouble with VAArg still runs into trouble because of details of the X86_64 ABI, and this patch addresses that. The practical impact is that, with this patch, #1088, and a small follow-up fix, we can build and run all of the C benchmarks in SPEC CPU 2017. I think that is a milestone.
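For reference, a minimal usage sketch of the pattern this change enables (it mirrors the new f2 test added to clang/test/CIR/Lowering/var-arg-x86_64.c in this commit; the function name is illustrative only):

#include <stdarg.h>

/* Sums n variadic long double arguments. Before this patch, va_arg on
 * long double could not be lowered through the CIR x86_64 path; with it,
 * the value is read from the 16-byte-aligned overflow area of va_list. */
long double sum_ld(int n, ...) {
  va_list ap;
  va_start(ap, n);
  long double total = 0.0L;
  for (int i = 0; i < n; ++i)
    total += va_arg(ap, long double);
  va_end(ap);
  return total;
}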
1 parent 03bb2e0 commit 2a95651

File tree

7 files changed (+186, -8 lines)

clang/lib/CIR/Dialect/IR/CIRTypes.cpp

+2 -1

@@ -745,7 +745,7 @@ const llvm::fltSemantics &FP80Type::getFloatSemantics() const {
 llvm::TypeSize
 FP80Type::getTypeSizeInBits(const mlir::DataLayout &dataLayout,
                             mlir::DataLayoutEntryListRef params) const {
-  return llvm::TypeSize::getFixed(16);
+  return llvm::TypeSize::getFixed(128);
 }
 
 uint64_t FP80Type::getABIAlignment(const mlir::DataLayout &dataLayout,
@@ -766,6 +766,7 @@ const llvm::fltSemantics &FP128Type::getFloatSemantics() const {
 llvm::TypeSize
 FP128Type::getTypeSizeInBits(const mlir::DataLayout &dataLayout,
                              mlir::DataLayoutEntryListRef params) const {
+  // FIXME: We probably want it to return 128. But we're lacking a test now.
   return llvm::TypeSize::getFixed(16);
 }
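An aside on the 128-bit figure above (editor's note, not part of the diff): x86_fp80 carries only 80 significant bits, but the x86-64 SysV ABI stores long double in 16 bytes with 16-byte alignment, so its in-memory type size is 128 bits. A quick C check on an x86-64 Linux target:

int main(void) {
  /* Storage size and alignment of the 80-bit x87 long double on x86-64 SysV. */
  _Static_assert(sizeof(long double) == 16, "long double occupies 16 bytes");
  _Static_assert(_Alignof(long double) == 16, "long double is 16-byte aligned");
  return 0;
}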

clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp

+7

@@ -65,4 +65,11 @@ CIRCXXABI::RecordArgABI getRecordArgABI(const StructType RT,
   return CXXABI.getRecordArgABI(RT);
 }
 
+CIRCXXABI::RecordArgABI getRecordArgABI(mlir::Type ty, CIRCXXABI &CXXABI) {
+  auto st = mlir::dyn_cast<StructType>(ty);
+  if (!st)
+    return CIRCXXABI::RAA_Default;
+  return getRecordArgABI(st, CXXABI);
+}
+
 } // namespace cir

clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h

+1

@@ -33,6 +33,7 @@ mlir::Value emitRoundPointerUpToAlignment(cir::CIRBaseBuilderTy &builder,
 mlir::Type useFirstFieldIfTransparentUnion(mlir::Type Ty);
 
 CIRCXXABI::RecordArgABI getRecordArgABI(const StructType RT, CIRCXXABI &CXXABI);
+CIRCXXABI::RecordArgABI getRecordArgABI(mlir::Type ty, CIRCXXABI &CXXABI);
 
 } // namespace cir

clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp

+1 -4

@@ -46,7 +46,6 @@ std::unique_ptr<cir::LowerModule> getLowerModule(cir::VAArgOp op) {
   mlir::ModuleOp mo = op->getParentOfType<mlir::ModuleOp>();
   if (!mo)
     return nullptr;
-
   mlir::PatternRewriter rewriter(mo.getContext());
   return cir::createLowerModule(mo, rewriter);
 }
@@ -92,7 +91,7 @@ mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
   // Let's hope LLVM's va_arg instruction can take care of it.
   // Remove this when X86_64ABIInfo::classify can take care of every type.
   if (!mlir::isa<VoidType, IntType, SingleType, DoubleType, BoolType,
-                 StructType>(op.getType()))
+                 StructType, LongDoubleType>(op.getType()))
     return nullptr;
 
   // Assume that va_list type is correct; should be pointer to LLVM type:
@@ -107,7 +106,6 @@ mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
   std::unique_ptr<cir::LowerModule> lowerModule = getLowerModule(op);
   if (!lowerModule)
     return nullptr;
-
   mlir::Type ty = op.getType();
 
   // FIXME: How should we access the X86AVXABILevel?
@@ -167,7 +165,6 @@ mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
   mlir::Block *contBlock = currentBlock->splitBlock(op);
   mlir::Block *inRegBlock = builder.createBlock(contBlock);
   mlir::Block *inMemBlock = builder.createBlock(contBlock);
-
   builder.setInsertionPointToEnd(currentBlock);
   builder.create<BrCondOp>(loc, inRegs, inRegBlock, inMemBlock);

clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp

+115 -3

@@ -165,6 +165,21 @@ void X86_64ABIInfo::classify(mlir::Type Ty, uint64_t OffsetBase, Class &Lo,
     Current = Class::SSE;
     return;
 
+  } else if (mlir::isa<LongDoubleType>(Ty)) {
+    const llvm::fltSemantics *LDF =
+        &getContext().getTargetInfo().getLongDoubleFormat();
+    if (LDF == &llvm::APFloat::IEEEquad()) {
+      Lo = Class::SSE;
+      Hi = Class::SSEUp;
+    } else if (LDF == &llvm::APFloat::x87DoubleExtended()) {
+      Lo = Class::X87;
+      Hi = Class::X87Up;
+    } else if (LDF == &llvm::APFloat::IEEEdouble()) {
+      Current = Class::SSE;
+    } else {
+      llvm_unreachable("unexpected long double representation!");
+    }
+    return;
   } else if (mlir::isa<BoolType>(Ty)) {
     Current = Class::Integer;
   } else if (const auto RT = mlir::dyn_cast<StructType>(Ty)) {
@@ -267,6 +282,65 @@ void X86_64ABIInfo::classify(mlir::Type Ty, uint64_t OffsetBase, Class &Lo,
   cir_cconv_unreachable("NYI");
 }
 
+ABIArgInfo X86_64ABIInfo::getIndirectResult(mlir::Type ty,
+                                            unsigned freeIntRegs) const {
+  // If this is a scalar LLVM value then assume LLVM will pass it in the right
+  // place naturally.
+  //
+  // This assumption is optimistic, as there could be free registers available
+  // when we need to pass this argument in memory, and LLVM could try to pass
+  // the argument in the free register. This does not seem to happen currently,
+  // but this code would be much safer if we could mark the argument with
+  // 'onstack'. See PR12193.
+  if (!isAggregateTypeForABI(ty) /* && IsIllegalVectorType(Ty) &&*/
+      /*!Ty->isBitIntType()*/) {
+    // FIXME: Handling enum type?
+
+    return (isPromotableIntegerTypeForABI(ty) ? ABIArgInfo::getExtend(ty)
+                                              : ABIArgInfo::getDirect());
+  }
+
+  if (CIRCXXABI::RecordArgABI RAA = getRecordArgABI(ty, getCXXABI()))
+    return getNaturalAlignIndirect(ty, RAA == CIRCXXABI::RAA_DirectInMemory);
+
+  // Compute the byval alignment. We specify the alignment of the byval in all
+  // cases so that the mid-level optimizer knows the alignment of the byval.
+  unsigned align = std::max(getContext().getTypeAlign(ty) / 8, 8U);
+
+  // Attempt to avoid passing indirect results using byval when possible. This
+  // is important for good codegen.
+  //
+  // We do this by coercing the value into a scalar type which the backend can
+  // handle naturally (i.e., without using byval).
+  //
+  // For simplicity, we currently only do this when we have exhausted all of the
+  // free integer registers. Doing this when there are free integer registers
+  // would require more care, as we would have to ensure that the coerced value
+  // did not claim the unused register. That would require either reording the
+  // arguments to the function (so that any subsequent inreg values came first),
+  // or only doing this optimization when there were no following arguments that
+  // might be inreg.
+  //
+  // We currently expect it to be rare (particularly in well written code) for
+  // arguments to be passed on the stack when there are still free integer
+  // registers available (this would typically imply large structs being passed
+  // by value), so this seems like a fair tradeoff for now.
+  //
+  // We can revisit this if the backend grows support for 'onstack' parameter
+  // attributes. See PR12193.
+  if (freeIntRegs == 0) {
+    uint64_t size = getContext().getTypeSize(ty);
+
+    // If this type fits in an eightbyte, coerce it into the matching integral
+    // type, which will end up on the stack (with alignment 8).
+    if (align == 8 && size <= 64)
+      return ABIArgInfo::getDirect(
+          cir::IntType::get(LT.getMLIRContext(), size, false));
+  }
+
+  return ABIArgInfo::getIndirect(align);
+}
+
 /// Return a type that will be passed by the backend in the low 8 bytes of an
 /// XMM register, corresponding to the SSE class.
 mlir::Type X86_64ABIInfo::GetSSETypeAtOffset(mlir::Type IRType,
@@ -278,7 +352,7 @@ mlir::Type X86_64ABIInfo::GetSSETypeAtOffset(mlir::Type IRType,
       (unsigned)getContext().getTypeSize(SourceTy) / 8 - SourceOffset;
   mlir::Type T0 = getFPTypeAtOffset(IRType, IROffset, TD);
   if (!T0 || mlir::isa<mlir::Float64Type>(T0))
-    return T0; // NOTE(cir): Not sure if this is correct.
+    return cir::DoubleType::get(LT.getMLIRContext());
 
   mlir::Type T1 = {};
   unsigned T0Size = TD.getTypeAllocSize(T0);
@@ -296,6 +370,8 @@ mlir::Type X86_64ABIInfo::GetSSETypeAtOffset(mlir::Type IRType,
     return T0;
   }
 
+  return cir::DoubleType::get(LT.getMLIRContext());
+
   cir_cconv_unreachable("NYI");
 }
 
@@ -539,13 +615,34 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(
     ++neededSSE;
     break;
   }
+  // AMD64-ABI 3.2.3p3: Rule 1. If the class is MEMORY, pass the argument
+  // on the stack.
+  case Class::Memory:
+
+  // AMD64-ABI 3.2.3p3: Rule 5. If the class is X87, X87UP or
+  // COMPLEX_X87, it is passed in memory.
+  case Class::X87:
+  case Class::ComplexX87:
+    if (getRecordArgABI(Ty, getCXXABI()) == CIRCXXABI::RAA_Indirect)
+      ++neededInt;
+    return getIndirectResult(Ty, freeIntRegs);
+
+  case Class::SSEUp:
+  case Class::X87Up:
+    llvm_unreachable("Invalid classification for lo word.");
+
   default:
     cir_cconv_assert_or_abort(!cir::MissingFeatures::X86ArgTypeClassification(),
                               "NYI");
   }
 
   mlir::Type HighPart = {};
   switch (Hi) {
+  case Class::Memory:
+  case Class::X87:
+  case Class::ComplexX87:
+    llvm_unreachable("Invalid classification for hi word.");
+
   case Class::NoClass:
     break;
 
@@ -558,8 +655,23 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(
       return ABIArgInfo::getDirect(HighPart, 8);
     break;
 
-  default:
-    cir_cconv_unreachable("NYI");
+  // X87Up generally doesn't occur here (long double is passed in
+  // memory), except in situations involving unions.
+  case Class::X87Up:
+  case Class::SSE:
+    ++neededSSE;
+    HighPart = GetSSETypeAtOffset(Ty, 8, Ty, 8);
+
+    if (Lo == Class::NoClass) // Pass HighPart at offset 8 in memory.
+      return ABIArgInfo::getDirect(HighPart, 8);
+    break;
+
+  // AMD64-ABI 3.2.3p3: Rule 4. If the class is SSEUP, the
+  // eightbyte is passed in the upper half of the last used SSE
+  // register. This only happens when 128-bit vectors are passed.
+  case Class::SSEUp:
+    llvm_unreachable("NYI && We need to implement GetByteVectorType");
+    break;
   }
 
   // If a high part was specified, merge it together with the low part. It is
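Background for the classification above (an illustration added here, not part of the diff): on x86-64 Linux the long double format is x87DoubleExtended, so classify() yields Lo = X87 and Hi = X87Up, and AMD64-ABI Rule 5 sends the argument to memory via getIndirectResult. The effect is visible in ordinary C, where an x87 long double argument is always passed on the stack while a plain double arrives in an SSE register:

/* Illustration only: compile for x86-64 SysV and compare calling conventions.
 * bump_ld receives its argument in memory (X87/X87Up -> MEMORY);
 * bump_d receives its argument in %xmm0 (SSE class). */
long double bump_ld(long double x) { return x + 1.0L; }
double bump_d(double x) { return x + 1.0; }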

clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h

+8

@@ -69,6 +69,14 @@ class X86_64ABIInfo : public cir::ABIInfo {
                                 mlir::Type SourceTy,
                                 unsigned SourceOffset) const;
 
+  /// getIndirectResult - Give a source type \arg Ty, return a suitable result
+  /// such that the argument will be passed in memory.
+  ///
+  /// \param freeIntRegs - The number of free integer registers remaining
+  /// available.
+  ::cir::ABIArgInfo getIndirectResult(mlir::Type ty,
+                                      unsigned freeIntRegs) const;
+
   /// The 0.98 ABI revision clarified a lot of ambiguities,
   /// unfortunately in ways that were not always consistent with
   /// certain previous compilers. In particular, platforms which

clang/test/CIR/Lowering/var-arg-x86_64.c

+52

@@ -76,3 +76,55 @@ double f1(int n, ...) {
 // CIR: [[CASTED_ARG_P:%.+]] = cir.cast(bitcast, [[ARG]]
 // CIR: [[CASTED_ARG:%.+]] = cir.load align(16) [[CASTED_ARG_P]]
 // CIR: store [[CASTED_ARG]], [[RES]]
+long double f2(int n, ...) {
+  va_list valist;
+  va_start(valist, n);
+  long double res = va_arg(valist, long double);
+  va_end(valist);
+  return res;
+}
+
+// CHECK: define {{.*}}@f2
+// CHECK: [[RESULT:%.+]] = alloca x86_fp80
+// CHECK: [[VA_LIST_ALLOCA:%.+]] = alloca {{.*}}[[VA_LIST_TYPE]]
+// CHECK: [[RES:%.+]] = alloca x86_fp80
+// CHECK: [[VA_LIST:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: call {{.*}}@llvm.va_start.p0(ptr [[VA_LIST]])
+// CHECK: [[VA_LIST2:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: [[OVERFLOW_AREA_P:%.+]] = getelementptr {{.*}} [[VA_LIST2]], i32 0, i32 2
+// CHECK: [[OVERFLOW_AREA:%.+]] = load ptr, ptr [[OVERFLOW_AREA_P]]
+// Ptr Mask Operations
+// CHECK: [[OVERFLOW_AREA_OFFSET_ALIGNED:%.+]] = getelementptr i8, ptr [[OVERFLOW_AREA]], i64 15
+// CHECK: [[OVERFLOW_AREA_OFFSET_ALIGNED_P:%.+]] = ptrtoint ptr [[OVERFLOW_AREA_OFFSET_ALIGNED]] to i32
+// CHECK: [[MASKED:%.+]] = and i32 [[OVERFLOW_AREA_OFFSET_ALIGNED_P]], -16
+// CHECK: [[DIFF:%.+]] = sub i32 [[OVERFLOW_AREA_OFFSET_ALIGNED_P]], [[MASKED]]
+// CHECK: [[PTR_MASKED:%.+]] = getelementptr i8, ptr [[OVERFLOW_AREA_OFFSET_ALIGNED]], i32 [[DIFF]]
+// CHECK: [[OVERFLOW_AREA_NEXT:%.+]] = getelementptr i8, ptr [[PTR_MASKED]], i64 16
+// CHECK: store ptr [[OVERFLOW_AREA_NEXT]], ptr [[OVERFLOW_AREA_P]]
+// CHECK: [[VALUE:%.+]] = load x86_fp80, ptr [[PTR_MASKED]]
+// CHECK: store x86_fp80 [[VALUE]], ptr [[RES]]
+// CHECK: [[VA_LIST2:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: call {{.*}}@llvm.va_end.p0(ptr [[VA_LIST2]])
+// CHECK: [[VALUE2:%.+]] = load x86_fp80, ptr [[RES]]
+// CHECK: store x86_fp80 [[VALUE2]], ptr [[RESULT]]
+// CHECK: [[RETURN_VALUE:%.+]] = load x86_fp80, ptr [[RESULT]]
+// CHECK: ret x86_fp80 [[RETURN_VALUE]]
+
+// CIR: cir.func @f2
+// CIR: [[VA_LIST_ALLOCA:%.+]] = cir.alloca !cir.array<!ty___va_list_tag x 1>, !cir.ptr<!cir.array<!ty___va_list_tag x 1>>, ["valist"]
+// CIR: [[RES:%.+]] = cir.alloca !cir.long_double<!cir.f80>, !cir.ptr<!cir.long_double<!cir.f80>>, ["res"
+// CIR: [[VASTED_VA_LIST:%.+]] = cir.cast(array_to_ptrdecay, [[VA_LIST_ALLOCA]]
+// CIR: cir.va.start [[VASTED_VA_LIST]]
+// CIR: [[VASTED_VA_LIST:%.+]] = cir.cast(array_to_ptrdecay, [[VA_LIST_ALLOCA]]
+// CIR: [[OVERFLOW_AREA_P:%.+]] = cir.get_member [[VASTED_VA_LIST]][2] {name = "overflow_arg_area"}
+// CIR-DAG: [[OVERFLOW_AREA:%.+]] = cir.load [[OVERFLOW_AREA_P]]
+// CIR-DAG: [[CASTED:%.+]] = cir.cast(bitcast, [[OVERFLOW_AREA]] : !cir.ptr<!void>)
+// CIR-DAG: [[CONSTANT:%.+]] = cir.const #cir.int<15>
+// CIR-DAG: [[PTR_STRIDE:%.+]] = cir.ptr_stride([[CASTED]] {{.*}}[[CONSTANT]]
+// CIR-DAG: [[MINUS_ALIGN:%.+]] = cir.const #cir.int<-16>
+// CIR-DAG: [[ALIGNED:%.+]] = cir.ptr_mask([[PTR_STRIDE]], [[MINUS_ALIGN]]
+// CIR: [[ALIGN:%.+]] = cir.const #cir.int<16>
+// CIR: [[CAST_ALIGNED:%.+]] = cir.cast(bitcast, [[ALIGNED]] : !cir.ptr<!u8i>), !cir.ptr<!cir.long_double<!cir.f80>>
+// CIR: [[CAST_ALIGNED_VALUE:%.+]] = cir.load [[CAST_ALIGNED]]
+// CIR: cir.store [[CAST_ALIGNED_VALUE]], [[RES]]
+// CIR. cir.via.end

0 commit comments