diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp
index dc9bf64ba3d6..f318ea7b1fdc 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerFunction.cpp
@@ -1388,8 +1388,14 @@ cir::TypeEvaluationKind LowerFunction::getEvaluationKind(mlir::Type type) {
   // FIXME(cir): Implement type classes for CIR types.
   if (mlir::isa<RecordType>(type))
     return cir::TypeEvaluationKind::TEK_Aggregate;
+
+  if (mlir::isa<ComplexType>(type))
+    return cir::TypeEvaluationKind::TEK_Complex;
+
+  // Scalar types
-  if (mlir::isa<BoolType, IntType, SingleType, DoubleType>(type))
+  if (mlir::isa<BoolType, IntType, SingleType, DoubleType, LongDoubleType,
+                VectorType, PointerType, VoidType, FP16Type, FP80Type,
+                FP128Type, BF16Type>(type))
     return cir::TypeEvaluationKind::TEK_Scalar;
   cir_cconv_unreachable("NYI");
 }
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/AMDGPU.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/AMDGPU.cpp
index 76d457f1607b..b333e7bcb53b 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/AMDGPU.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/AMDGPU.cpp
@@ -11,10 +11,14 @@
 
 #include "LowerTypes.h"
 #include "TargetInfo.h"
 #include "TargetLoweringInfo.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
+#include "clang/Basic/AddressSpaces.h"
 #include "clang/CIR/ABIArgInfo.h"
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
 #include "clang/CIR/Dialect/IR/CIROpsEnums.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -34,9 +38,19 @@ class AMDGPUABIInfo : public ABIInfo {
   AMDGPUABIInfo(LowerTypes &lt) : ABIInfo(lt) {}
 
 private:
-  void computeInfo(LowerFunctionInfo &fi) const override {
-    llvm_unreachable("NYI");
-  }
+  static const unsigned maxNumRegsForArgsRet = 16;
+
+  unsigned numRegsForType(mlir::Type ty) const;
+
+  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
+  mlir::Type coerceKernelArgumentType(mlir::Type ty, unsigned fromAS,
+                                      unsigned toAS) const;
+
+  ABIArgInfo classifyReturnType(mlir::Type ty) const;
+  ABIArgInfo classifyArgumentType(mlir::Type ty, bool variadic,
+                                  unsigned &numRegsLeft) const;
+  ABIArgInfo classifyKernelArgumentType(mlir::Type ty) const;
+  void computeInfo(LowerFunctionInfo &fi) const override;
 };
 
 class AMDGPUTargetLoweringInfo : public TargetLoweringInfo {
@@ -63,7 +77,129 @@ class AMDGPUTargetLoweringInfo : public TargetLoweringInfo {
   }
 };
 
+// Estimate the number of 32-bit registers needed to pass a value of this type.
+unsigned AMDGPUABIInfo::numRegsForType(mlir::Type ty) const {
+  if (isAggregateTypeForABI(ty)) {
+    llvm_unreachable("numRegsForType for aggregate types is NYI for AMDGPU");
+  }
+
+  uint64_t size = getContext().getTypeSize(ty);
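+  // One register for every started 32 bits of the value, so e.g. a 64-bit
+  // scalar counts as two registers.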
+  return (size + 31) / 32;
+}
+
+// Coerce HIP scalar pointer arguments from generic pointers to global ones.
+mlir::Type AMDGPUABIInfo::coerceKernelArgumentType(mlir::Type ty,
+                                                   unsigned fromAS,
+                                                   unsigned toAS) const {
+  if (auto ptrTy = mlir::dyn_cast<cir::PointerType>(ty)) {
+    mlir::Attribute addrSpaceAttr = ptrTy.getAddrSpace();
+    unsigned currentAS = 0;
+    // Get the current address space.
+    if (auto targetAS = mlir::dyn_cast_if_present<cir::TargetAddressSpaceAttr>(
+            addrSpaceAttr))
+      currentAS = targetAS.getValue();
+    // If currentAS matches fromAS, coerce the pointer to toAS.
+    if (currentAS == fromAS) {
+      auto newAddrSpaceAttr =
+          cir::TargetAddressSpaceAttr::get(ty.getContext(), toAS);
+      return cir::PointerType::get(ptrTy.getPointee(), newAddrSpaceAttr);
+    }
+  }
+  return ty;
+}
+
+ABIArgInfo AMDGPUABIInfo::classifyReturnType(mlir::Type ty) const {
+  if (isAggregateTypeForABI(ty)) {
+    llvm_unreachable(
+        "classifyReturnType for aggregate types is NYI for AMDGPU");
+  }
+
+  return isPromotableIntegerTypeForABI(ty) ? ABIArgInfo::getExtend(ty)
+                                           : ABIArgInfo::getDirect();
+}
+
+ABIArgInfo AMDGPUABIInfo::classifyArgumentType(mlir::Type ty, bool variadic,
+                                               unsigned &numRegsLeft) const {
+  assert(numRegsLeft <= maxNumRegsForArgsRet && "register estimate underflow");
+
+  ty = useFirstFieldIfTransparentUnion(ty);
+
+  if (isAggregateTypeForABI(ty)) {
+    llvm_unreachable(
+        "classifyArgumentType for aggregate types is NYI for AMDGPU");
+  }
+
+  if (variadic) {
+    return ABIArgInfo::getDirect(nullptr, 0, nullptr, false, 0);
+  }
+
+  ABIArgInfo argInfo =
+      (isPromotableIntegerTypeForABI(ty) ? ABIArgInfo::getExtend(ty)
+                                         : ABIArgInfo::getDirect());
+
+  // Track register usage for direct arguments.
+  if (!argInfo.isIndirect()) {
+    unsigned numRegs = numRegsForType(ty);
+    numRegsLeft -= std::min(numRegs, numRegsLeft);
+  }
+
+  return argInfo;
+}
+
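+// Kernel arguments (amdgpu_kernel calling convention) are classified
+// separately from ordinary device-function arguments: they are always passed
+// direct and are not counted against the numRegsLeft budget used by
+// classifyArgumentType.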
+ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(mlir::Type ty) const {
+  ty = useFirstFieldIfTransparentUnion(ty);
+
+  // Aggregate types are not yet supported.
+  if (isAggregateTypeForABI(ty)) {
+    llvm_unreachable("Aggregate types NYI for AMDGPU kernel arguments");
+  }
+
+  mlir::Type origTy = ty;
+  mlir::Type coercedTy = origTy;
+
+  // Determine whether this is a HIP compilation based on the triple.
+  // TODO: use getLangOpts().HIP instead.
+  const auto &triple = getTarget().getTriple();
+  bool isHIP = triple.getArch() == llvm::Triple::amdgcn &&
+               triple.getOS() == llvm::Triple::AMDHSA;
+
+  // For HIP, coerce pointer arguments from the generic to the global address
+  // space.
+  if (isHIP) {
+    unsigned genericAS =
+        getTarget().getTargetAddressSpace(clang::LangAS::Default);
+    unsigned globalAS =
+        getTarget().getTargetAddressSpace(clang::LangAS::cuda_device);
+    coercedTy = coerceKernelArgumentType(origTy, genericAS, globalAS);
+  }
+
+  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
+  // individual elements, which confuses the Clover OpenCL backend; therefore we
+  // have to set it to false here. Other args of getDirect() are just defaults.
+  return ABIArgInfo::getDirect(coercedTy, 0, nullptr, false);
+}
+
+void AMDGPUABIInfo::computeInfo(LowerFunctionInfo &fi) const {
+  const unsigned cc = fi.getCallingConvention();
+
+  if (!getCXXABI().classifyReturnType(fi))
+    fi.getReturnInfo() = classifyReturnType(fi.getReturnType());
+
+  unsigned argumentIndex = 0;
+  const unsigned numFixedArguments = fi.getNumRequiredArgs();
+
+  unsigned numRegsLeft = maxNumRegsForArgsRet;
+  for (auto &arg : fi.arguments()) {
+    if (cc == llvm::CallingConv::AMDGPU_KERNEL) {
+      arg.info = classifyKernelArgumentType(arg.type);
+    } else {
+      bool fixedArgument = argumentIndex++ < numFixedArguments;
+      arg.info = classifyArgumentType(arg.type, !fixedArgument, numRegsLeft);
+    }
+  }
+}
+
 } // namespace
+
 std::unique_ptr<TargetLoweringInfo>
 createAMDGPUTargetLoweringInfo(LowerModule &lowerModule) {
   return std::make_unique<AMDGPUTargetLoweringInfo>(lowerModule.getTypes());
diff --git a/clang/test/CIR/CodeGen/HIP/amdgpu-hip-kernel-abi.hip b/clang/test/CIR/CodeGen/HIP/amdgpu-hip-kernel-abi.hip
index 014b9da2b330..c429907dc076 100644
--- a/clang/test/CIR/CodeGen/HIP/amdgpu-hip-kernel-abi.hip
+++ b/clang/test/CIR/CodeGen/HIP/amdgpu-hip-kernel-abi.hip
@@ -2,27 +2,73 @@
 // RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
 // RUN:   -fcuda-is-device -fhip-new-launch-api \
-// RUN:   -I%S/../../Inputs/ -emit-cir %s -o %t.cir
-// RUN: FileCheck --input-file=%t.cir %s
+// RUN:   -I%S/../Inputs/ -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
+// RUN:   -fcuda-is-device -fhip-new-launch-api \
+// RUN:   -I%S/../Inputs/ -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip \
+// RUN:   -fcuda-is-device -fhip-new-launch-api \
+// RUN:   -I%S/../Inputs/ -emit-llvm %s -o %t.ogcg.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ogcg.ll %s
+
+//===----------------------------------------------------------------------===//
+// Test ABI lowering from CIR to LLVM IR for AMDGPU
+//===----------------------------------------------------------------------===//
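+// CIR checks the emitted ClangIR, LLVM checks the LLVM IR produced through
+// ClangIR, and OGCG checks the LLVM IR from the original (non-CIR) CodeGen
+// path for comparison.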
 
 // Test simple kernel
-// CHECK: cir.func{{.*}} @_Z13simple_kerneli(%arg{{[0-9]+}}: !s32i{{.*}}) cc(amdgpu_kernel)
-__global__ void simple_kernel(int x) {}
+// CIR: cir.func{{.*}} @_Z13simple_kernelv{{.*}} cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @_Z13simple_kernelv()
+// OGCG: define{{.*}} amdgpu_kernel void @_Z13simple_kernelv()
+__global__ void simple_kernel() {}
+
+// Test kernel with int argument
+// CIR: cir.func{{.*}} @_Z14kernel_int_argi(%arg{{[0-9]+}}: !s32i{{.*}}) cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @_Z14kernel_int_argi(i32 %{{.*}})
+// OGCG: define{{.*}} amdgpu_kernel void @_Z14kernel_int_argi(i32{{.*}} %{{.*}})
+__global__ void kernel_int_arg(int x) {}
+
+// Test kernel with char argument
+// CIR: cir.func{{.*}} @_Z15kernel_char_argc(%arg{{[0-9]+}}: !s8i{{.*}}) cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @_Z15kernel_char_argc(i8 %{{.*}})
+// OGCG: define{{.*}} amdgpu_kernel void @_Z15kernel_char_argc(i8{{.*}} %{{.*}})
+__global__ void kernel_char_arg(char c) {}
 
-// Test kernel with pointer
-// CHECK: cir.func{{.*}} @_Z15kernel_with_ptrPi(%arg{{[0-9]+}}: !cir.ptr{{.*}}) cc(amdgpu_kernel)
-__global__ void kernel_with_ptr(int *ptr) {}
+// Test kernel with pointer (HIP coerces generic pointers to global addrspace 1)
+// CIR: cir.func{{.*}} @_Z14kernel_ptr_argPi(%arg{{[0-9]+}}: !cir.ptr{{.*}}) cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @_Z14kernel_ptr_argPi(ptr addrspace(1) %{{.*}})
+// OGCG: define{{.*}} amdgpu_kernel void @_Z14kernel_ptr_argPi(ptr addrspace(1){{.*}} %{{.*}})
+__global__ void kernel_ptr_arg(int *ptr) {}
 
-// Test kernel with multiple args
-// CHECK: cir.func{{.*}} @_Z16kernel_multi_argifdPi(%arg{{[0-9]+}}: !s32i{{.*}}, %arg{{[0-9]+}}: !cir.float{{.*}}, %arg{{[0-9]+}}: !cir.double{{.*}}, %arg{{[0-9]+}}: !cir.ptr{{.*}}) cc(amdgpu_kernel)
-__global__ void kernel_multi_arg(int a, float b, double c, int *d) {}
+// Test kernel with multiple args (pointer coerced to global addrspace 1)
+// CIR: cir.func{{.*}} @_Z16kernel_multi_argifPf(%arg{{[0-9]+}}: !s32i{{.*}} !cir.float{{.*}} !cir.ptr{{.*}}) cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @_Z16kernel_multi_argifPf(i32 %{{.*}}, float %{{.*}}, ptr addrspace(1) %{{.*}})
+// OGCG: define{{.*}} amdgpu_kernel void @_Z16kernel_multi_argifPf(i32{{.*}} %{{.*}}, float{{.*}} %{{.*}}, ptr addrspace(1){{.*}} %{{.*}})
+__global__ void kernel_multi_arg(int a, float b, float *c) {}
 
-// Test device function (NOT a kernel)
-// CHECK: cir.func{{.*}} @_Z9device_fni(%arg{{[0-9]+}}: !s32i{{.*}})
-// CHECK-NOT: cc(amdgpu_kernel)
+// Test device function
+// CIR: cir.func{{.*}} @_Z9device_fni(%arg{{[0-9]+}}: !s32i{{.*}})
+// LLVM: define{{.*}} void @_Z9device_fni(i32 %{{.*}})
+// OGCG: define{{.*}} void @_Z9device_fni(i32{{.*}} %{{.*}})
 __device__ void device_fn(int x) {}
 
 // Test device function with return value
-// CHECK: cir.func{{.*}} @_Z13device_fn_reti(%arg{{[0-9]+}}: !s32i{{.*}}) -> !s32i
-// CHECK-NOT: cc(amdgpu_kernel)
-__device__ int device_fn_ret(int x) { return x + 1; }
+// CIR: cir.func{{.*}} @_Z15device_fn_floatf(%arg{{[0-9]+}}: !cir.float{{.*}}) -> !cir.float
+// LLVM: define{{.*}} float @_Z15device_fn_floatf(float %{{.*}})
+// OGCG: define{{.*}} float @_Z15device_fn_floatf(float{{.*}} %{{.*}})
+__device__ float device_fn_float(float f) { return f * 2.0f; }
+
+// Test kernel with pointer (coerced to global addrspace 1)
+// CIR: cir.func{{.*}} @_Z17kernel_shared_ptrPi(%arg{{[0-9]+}}: !cir.ptr{{.*}}) cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @_Z17kernel_shared_ptrPi(ptr addrspace(1) %{{.*}})
+// OGCG: define{{.*}} amdgpu_kernel void @_Z17kernel_shared_ptrPi(ptr addrspace(1){{.*}} %{{.*}})
+__global__ void kernel_shared_ptr(int *ptr) {}
+
+// Test variadic device function
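+// Arguments passed through the ellipsis are classified as Direct and skip the
+// integer-promotion check applied to fixed parameters (see
+// classifyArgumentType).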
+// CIR: cir.func{{.*}} @_Z11variadic_fniz(%arg{{[0-9]+}}: !s32i{{.*}}, ...)
+// LLVM: define{{.*}} void @_Z11variadic_fniz(i32 %{{.*}}, ...)
+// OGCG: define{{.*}} void @_Z11variadic_fniz(i32{{.*}} %{{.*}}, ...)
+__device__ void variadic_fn(int count, ...) {}
diff --git a/clang/test/CIR/CodeGen/HIP/amdgpu-kernel-calling-conv.cl b/clang/test/CIR/CodeGen/HIP/amdgpu-kernel-calling-conv.cl
deleted file mode 100644
index 6c94b1921422..000000000000
--- a/clang/test/CIR/CodeGen/HIP/amdgpu-kernel-calling-conv.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: %clang_cc1 %s -fclangir -triple amdgcn-amd-amdhsa -emit-cir -o %t.cir
-// RUN: FileCheck --input-file=%t.cir %s
-
-// Test kernel function with amdgpu_kernel calling convention
-// CHECK: cir.func{{.*}} @simple_kernel() cc(amdgpu_kernel)
-__kernel void simple_kernel() {}
-
-// Test kernel with simple integer argument
-// CHECK: cir.func{{.*}} @kernel_with_int(%arg{{[0-9]+}}: !s32i{{.*}}) cc(amdgpu_kernel)
-__kernel void kernel_with_int(int x) {}
-
-// Test kernel with pointer argument (should be in global address space)
-// CHECK: cir.func{{.*}} @kernel_with_ptr(%arg{{[0-9]+}}: !cir.ptr{{.*}}) cc(amdgpu_kernel)
-__kernel void kernel_with_ptr(global int *ptr) {}
-
-// Test device function (should NOT have amdgpu_kernel calling convention)
-// CHECK: cir.func{{.*}} @device_fn
-// CHECK-NOT: cc(amdgpu_kernel)
-void device_fn(int x) {}
diff --git a/clang/test/CIR/CodeGen/OpenCL/amdgpu-kernel-abi.cl b/clang/test/CIR/CodeGen/OpenCL/amdgpu-kernel-abi.cl
new file mode 100644
index 000000000000..aee39944e4f8
--- /dev/null
+++ b/clang/test/CIR/CodeGen/OpenCL/amdgpu-kernel-abi.cl
@@ -0,0 +1,66 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fclangir \
+// RUN:   -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fclangir \
+// RUN:   -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
+// RUN:   -emit-llvm %s -o %t.ogcg.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ogcg.ll %s
+
+//===----------------------------------------------------------------------===//
+// Test ABI lowering from CIR to LLVM IR for AMDGPU OpenCL kernels
+//===----------------------------------------------------------------------===//
+
+// Test simple kernel
+// CIR: cir.func{{.*}} @simple_kernel{{.*}} cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @simple_kernel()
+// OGCG: define{{.*}} amdgpu_kernel void @simple_kernel()
+__kernel void simple_kernel() {}
+
+// Test kernel with int argument
+// CIR: cir.func{{.*}} @kernel_with_int(%arg{{[0-9]+}}: !s32i{{.*}}) cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @kernel_with_int(i32 %{{.*}})
+// OGCG: define{{.*}} amdgpu_kernel void @kernel_with_int(i32{{.*}} %{{.*}})
+__kernel void kernel_with_int(int x) {}
+
+// Test kernel with pointer argument
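+// (a global int pointer already carries addrspace(1), so no generic-to-global
+// coercion is needed)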
+// CIR: cir.func{{.*}} @kernel_with_ptr(%arg{{[0-9]+}}: !cir.ptr{{.*}}) cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @kernel_with_ptr(ptr addrspace(1){{.*}}%{{.*}})
+// OGCG: define{{.*}} amdgpu_kernel void @kernel_with_ptr(ptr addrspace(1){{.*}} %{{.*}})
+__kernel void kernel_with_ptr(global int *ptr) {}
+
+// Test kernel with multiple args
+// CIR: cir.func{{.*}} @kernel_multi_arg(%arg{{[0-9]+}}: !s32i{{.*}}, %arg{{[0-9]+}}: !cir.float{{.*}}, %arg{{[0-9]+}}: !cir.ptr{{.*}}) cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @kernel_multi_arg(i32 %{{.*}}, float %{{.*}}, ptr addrspace(1){{.*}}%{{.*}})
+// OGCG: define{{.*}} amdgpu_kernel void @kernel_multi_arg(i32{{.*}} %{{.*}}, float{{.*}} %{{.*}}, ptr addrspace(1){{.*}} %{{.*}})
+__kernel void kernel_multi_arg(int a, float b, global float *c) {}
+
+// Test device function
+// CIR: cir.func{{.*}} @device_fn(%arg{{[0-9]+}}: !s32i{{.*}})
+// CIR-NOT: cc(amdgpu_kernel)
+// LLVM: define{{.*}} void @device_fn(i32 %{{.*}})
+// LLVM-NOT: amdgpu_kernel
+// OGCG: define{{.*}} void @device_fn(i32{{.*}} %{{.*}})
+// OGCG-NOT: amdgpu_kernel
+void device_fn(int x) {}
+
+// Test device function with return value
+// CIR: cir.func{{.*}} @device_fn_float(%arg{{[0-9]+}}: !cir.float{{.*}}) -> !cir.float
+// LLVM: define{{.*}} float @device_fn_float(float %{{.*}})
+// OGCG: define{{.*}} float @device_fn_float(float{{.*}} %{{.*}})
+float device_fn_float(float f) { return f * 2.0f; }
+
+// Test kernel with local address space pointer (addrspace 3)
+// CIR: cir.func{{.*}} @kernel_local_ptr(%arg{{[0-9]+}}: !cir.ptr{{.*}}) cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @kernel_local_ptr(ptr addrspace(3){{.*}}%{{.*}})
+// OGCG: define{{.*}} amdgpu_kernel void @kernel_local_ptr(ptr addrspace(3){{.*}} %{{.*}})
+__kernel void kernel_local_ptr(local int *ptr) {}
+
+// Test kernel with constant address space pointer (addrspace 4)
+// CIR: cir.func{{.*}} @kernel_constant_ptr(%arg{{[0-9]+}}: !cir.ptr{{.*}}) cc(amdgpu_kernel)
+// LLVM: define{{.*}} amdgpu_kernel void @kernel_constant_ptr(ptr addrspace(4){{.*}}%{{.*}})
+// OGCG: define{{.*}} amdgpu_kernel void @kernel_constant_ptr(ptr addrspace(4){{.*}} %{{.*}})
+__kernel void kernel_constant_ptr(constant int *ptr) {}