[CIR][AMDGPU] Add lowering for amdgcn rsq builtins

skc7 · skc7 · commit 028cb8a610c3 · 2025-12-23T16:06:13.000+05:30
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp
@@ -332,11 +332,13 @@ mlir::Value CIRGenFunction::emitAMDGPUBuiltinExpr(unsigned builtinId,
   case AMDGPU::BI__builtin_amdgcn_rsqf:
   case AMDGPU::BI__builtin_amdgcn_rsqh:
   case AMDGPU::BI__builtin_amdgcn_rsq_bf16: {
-    llvm_unreachable("rsq_* NYI");
+    return emitBuiltinWithOneOverloadedType<1>(expr, "amdgcn.rsq")
+        .getScalarVal();
   }
   case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
   case AMDGPU::BI__builtin_amdgcn_rsq_clampf: {
-    llvm_unreachable("rsq_clamp_* NYI");
+    return emitBuiltinWithOneOverloadedType<1>(expr, "amdgcn.rsq.clamp")
+        .getScalarVal();
   }
   case AMDGPU::BI__builtin_amdgcn_sinf:
   case AMDGPU::BI__builtin_amdgcn_sinh:
diff --git a/clang/test/CIR/CodeGen/HIP/builtins-amdgcn-gfx1250.hip b/clang/test/CIR/CodeGen/HIP/builtins-amdgcn-gfx1250.hip
@@ -27,3 +27,14 @@ __device__ void test_rcp_bf16(__bf16* out, __bf16 a)
 {
   *out = __builtin_amdgcn_rcp_bf16(a);
 }
+
+// CIR-LABEL: @_Z13test_rsq_bf16PDF16bDF16b
+// CIR: cir.llvm.intrinsic "amdgcn.rsq" {{.*}} : (!cir.bf16) -> !cir.bf16
+// LLVM: define{{.*}} void @_Z13test_rsq_bf16PDF16bDF16b
+// LLVM: call{{.*}} bfloat @llvm.amdgcn.rsq.bf16(bfloat %{{.*}})
+// OGCG: define{{.*}} void @_Z13test_rsq_bf16PDF16bDF16b
+// OGCG: call{{.*}} bfloat @llvm.amdgcn.rsq.bf16(bfloat %{{.*}})
+__device__ void test_rsq_bf16(__bf16* out, __bf16 a)
+{
+  *out = __builtin_amdgcn_rsq_bf16(a);
+}
diff --git a/clang/test/CIR/CodeGen/HIP/builtins-amdgcn-vi.hip b/clang/test/CIR/CodeGen/HIP/builtins-amdgcn-vi.hip
@@ -76,3 +76,14 @@ __device__ void test_rcp_f16(_Float16* out, _Float16 a)
 {
   *out = __builtin_amdgcn_rcph(a);
 }
+
+// CIR-LABEL: @_Z10test_rsq_hPDF16_DF16_
+// CIR: cir.llvm.intrinsic "amdgcn.rsq" {{.*}} : (!cir.f16) -> !cir.f16
+// LLVM: define{{.*}} void @_Z10test_rsq_hPDF16_DF16_
+// LLVM: call{{.*}} half @llvm.amdgcn.rsq.f16(half %{{.*}})
+// OGCG: define{{.*}} void @_Z10test_rsq_hPDF16_DF16_
+// OGCG: call{{.*}} half @llvm.amdgcn.rsq.f16(half %{{.*}})
+__device__ void test_rsq_h(_Float16* out, _Float16 a)
+{
+  *out = __builtin_amdgcn_rsqh(a);
+}
diff --git a/clang/test/CIR/CodeGen/HIP/builtins-amdgcn.hip b/clang/test/CIR/CodeGen/HIP/builtins-amdgcn.hip
@@ -365,3 +365,45 @@ __device__ void test_rcp_f32(float* out, float a) {
 __device__ void test_rcp_f64(double* out, double a) {
   *out = __builtin_amdgcn_rcp(a);
 }
+
+// CIR-LABEL: @_Z12test_rsq_f32Pff
+// CIR: cir.llvm.intrinsic "amdgcn.rsq" {{.*}} : (!cir.float) -> !cir.float
+// LLVM: define{{.*}} void @_Z12test_rsq_f32Pff
+// LLVM: call{{.*}} float @llvm.amdgcn.rsq.f32(float %{{.*}})
+// OGCG: define{{.*}} void @_Z12test_rsq_f32Pff
+// OGCG: call{{.*}} float @llvm.amdgcn.rsq.f32(float %{{.*}})
+__device__ void test_rsq_f32(float* out, float a)
+{
+  *out = __builtin_amdgcn_rsqf(a);
+}
+
+// CIR-LABEL: @_Z12test_rsq_f64Pdd
+// CIR: cir.llvm.intrinsic "amdgcn.rsq" {{.*}} : (!cir.double) -> !cir.double
+// LLVM: define{{.*}} void @_Z12test_rsq_f64Pdd
+// LLVM: call{{.*}} double @llvm.amdgcn.rsq.f64(double %{{.*}})
+// OGCG: define{{.*}} void @_Z12test_rsq_f64Pdd
+// OGCG: call{{.*}} double @llvm.amdgcn.rsq.f64(double %{{.*}})
+__device__ void test_rsq_f64(double* out, double a) {
+  *out = __builtin_amdgcn_rsq(a);
+}
+
+// CIR-LABEL: @_Z18test_rsq_clamp_f32Pff
+// CIR: cir.llvm.intrinsic "amdgcn.rsq.clamp" {{.*}} : (!cir.float) -> !cir.float
+// LLVM: define{{.*}} void @_Z18test_rsq_clamp_f32Pff
+// LLVM: call{{.*}} float @llvm.amdgcn.rsq.clamp.f32(float %{{.*}})
+// OGCG: define{{.*}} void @_Z18test_rsq_clamp_f32Pff
+// OGCG: call{{.*}} float @llvm.amdgcn.rsq.clamp.f32(float %{{.*}})
+__device__ void test_rsq_clamp_f32(float* out, float a)
+{
+  *out = __builtin_amdgcn_rsq_clampf(a);
+}
+
+// CIR-LABEL: @_Z18test_rsq_clamp_f64Pdd
+// CIR: cir.llvm.intrinsic "amdgcn.rsq.clamp" {{.*}} : (!cir.double) -> !cir.double
+// LLVM: define{{.*}} void @_Z18test_rsq_clamp_f64Pdd
+// LLVM: call{{.*}} double @llvm.amdgcn.rsq.clamp.f64(double %{{.*}})
+// OGCG: define{{.*}} void @_Z18test_rsq_clamp_f64Pdd
+// OGCG: call{{.*}} double @llvm.amdgcn.rsq.clamp.f64(double %{{.*}})
+__device__ void test_rsq_clamp_f64(double* out, double a) {
+  *out = __builtin_amdgcn_rsq_clamp(a);
+}
diff --git a/clang/test/CIR/CodeGen/OpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CIR/CodeGen/OpenCL/builtins-amdgcn-gfx1250.cl
@@ -27,3 +27,14 @@ void test_rcp_bf16(global __bf16* out, __bf16 a)
 {
   *out = __builtin_amdgcn_rcp_bf16(a);
 }
+
+// CIR-LABEL: @test_rsq_bf16
+// CIR: cir.llvm.intrinsic "amdgcn.rsq" {{.*}} : (!cir.bf16) -> !cir.bf16
+// LLVM: define{{.*}} void @test_rsq_bf16
+// LLVM: call{{.*}} bfloat @llvm.amdgcn.rsq.bf16(bfloat %{{.*}})
+// OGCG: define{{.*}} void @test_rsq_bf16
+// OGCG: call{{.*}} bfloat @llvm.amdgcn.rsq.bf16(bfloat %{{.*}})
+void test_rsq_bf16(__bf16* out, __bf16 a)
+{
+  *out = __builtin_amdgcn_rsq_bf16(a);
+}
diff --git a/clang/test/CIR/CodeGen/OpenCL/builtins-amdgcn-vi.cl b/clang/test/CIR/CodeGen/OpenCL/builtins-amdgcn-vi.cl
@@ -76,3 +76,14 @@ void test_rcp_f16(global half* out, half a)
 {
   *out = __builtin_amdgcn_rcph(a);
 }
+
+// CIR-LABEL: @test_rsq_f16
+// CIR: cir.llvm.intrinsic "amdgcn.rsq" {{.*}} : (!cir.f16) -> !cir.f16
+// LLVM: define{{.*}} void @test_rsq_f16
+// LLVM: call{{.*}} half @llvm.amdgcn.rsq.f16(half %{{.*}})
+// OGCG: define{{.*}} void @test_rsq_f16
+// OGCG: call{{.*}} half @llvm.amdgcn.rsq.f16(half %{{.*}})
+void test_rsq_f16(global half* out, half a)
+{
+  *out = __builtin_amdgcn_rsqh(a);
+}
diff --git a/clang/test/CIR/CodeGen/OpenCL/builtins_amdgcn.cl b/clang/test/CIR/CodeGen/OpenCL/builtins_amdgcn.cl
@@ -380,3 +380,43 @@ void test_rcp_f32(global float* out, float a) {
 void test_rcp_f64(global double* out, double a) {
   *out = __builtin_amdgcn_rcp(a);
 }
+
+// CIR-LABEL: @test_rsq_f32
+// CIR: cir.llvm.intrinsic "amdgcn.rsq" {{.*}} : (!cir.float) -> !cir.float
+// LLVM: define{{.*}} void @test_rsq_f32
+// LLVM: call{{.*}} float @llvm.amdgcn.rsq.f32(float %{{.*}})
+// OGCG: define{{.*}} void @test_rsq_f32
+// OGCG: call{{.*}} float @llvm.amdgcn.rsq.f32(float %{{.*}})
+void test_rsq_f32(global float* out, float a) {
+  *out = __builtin_amdgcn_rsqf(a);
+}
+
+// CIR-LABEL: @test_rsq_f64
+// CIR: cir.llvm.intrinsic "amdgcn.rsq" {{.*}} : (!cir.double) -> !cir.double
+// LLVM: define{{.*}} void @test_rsq_f64
+// LLVM: call{{.*}} double @llvm.amdgcn.rsq.f64(double %{{.*}})
+// OGCG: define{{.*}} void @test_rsq_f64
+// OGCG: call{{.*}} double @llvm.amdgcn.rsq.f64(double %{{.*}})
+void test_rsq_f64(global double* out, double a) {
+  *out = __builtin_amdgcn_rsq(a);
+}
+
+// CIR-LABEL: @test_rsq_clamp_f32
+// CIR: cir.llvm.intrinsic "amdgcn.rsq.clamp" {{.*}} : (!cir.float) -> !cir.float
+// LLVM: define{{.*}} void @test_rsq_clamp_f32
+// LLVM: call{{.*}} float @llvm.amdgcn.rsq.clamp.f32(float %{{.*}})
+// OGCG: define{{.*}} void @test_rsq_clamp_f32
+// OGCG: call{{.*}} float @llvm.amdgcn.rsq.clamp.f32(float %{{.*}})
+void test_rsq_clamp_f32(global float* out, float a) {
+  *out = __builtin_amdgcn_rsq_clampf(a);
+}
+
+// CIR-LABEL: @test_rsq_clamp_f64
+// CIR: cir.llvm.intrinsic "amdgcn.rsq.clamp" {{.*}} : (!cir.double) -> !cir.double
+// LLVM: define{{.*}} void @test_rsq_clamp_f64
+// LLVM: call{{.*}} double @llvm.amdgcn.rsq.clamp.f64(double %{{.*}})
+// OGCG: define{{.*}} void @test_rsq_clamp_f64
+// OGCG: call{{.*}} double @llvm.amdgcn.rsq.clamp.f64(double %{{.*}})
+void test_rsq_clamp_f64(global double* out, double a) {
+  *out = __builtin_amdgcn_rsq_clamp(a);
+}