llvm.org GIT mirror llvm / 4b07ed6
Merging r323908:
------------------------------------------------------------------------
r323908 | mareko | 2018-01-31 21:18:04 +0100 (Wed, 31 Jan 2018) | 7 lines

AMDGPU: Add intrinsics llvm.amdgcn.cvt.{pknorm.i16, pknorm.u16, pk.i16, pk.u16}

Reviewers: arsenm, nhaehnle

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye

Differential Revision: https://reviews.llvm.org/D41663
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_60@324103 91177308-0d34-0410-b5e6-96231b3b80d8

Hans Wennborg, 1 year, 7 months ago
12 changed file(s) with 702 addition(s) and 8 deletion(s).
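For orientation, here is a minimal LLVM IR sketch of how the four new intrinsics are called; it follows the pattern used by the CodeGen tests added in this change, where the packed <2 x i16> result is bitcast to i32 before being stored. The function name @use_pknorm_i16 is illustrative only.

declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float)
declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float)
declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32)
declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32)

; Pack two floats into two signed-normalized i16 values, then store the pair as an i32.
define amdgpu_kernel void @use_pknorm_i16(i32 addrspace(1)* %out, float %x, float %y) {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
  %r = bitcast <2 x i16> %cvt to i32
  store i32 %r, i32 addrspace(1)* %out
  ret void
}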
234234
235235 def int_amdgcn_cvt_pkrtz : Intrinsic<
236236 [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
237 [IntrNoMem, IntrSpeculatable]
238 >;
239
240 def int_amdgcn_cvt_pknorm_i16 : Intrinsic<
241 [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
242 [IntrNoMem, IntrSpeculatable]
243 >;
244
245 def int_amdgcn_cvt_pknorm_u16 : Intrinsic<
246 [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
247 [IntrNoMem, IntrSpeculatable]
248 >;
249
250 def int_amdgcn_cvt_pk_i16 : Intrinsic<
251 [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
252 [IntrNoMem, IntrSpeculatable]
253 >;
254
255 def int_amdgcn_cvt_pk_u16 : Intrinsic<
256 [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
237257 [IntrNoMem, IntrSpeculatable]
238258 >;
239259
39563956 NODE_NAME_CASE(CVT_F32_UBYTE2)
39573957 NODE_NAME_CASE(CVT_F32_UBYTE3)
39583958 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
3959 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
3960 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
3961 NODE_NAME_CASE(CVT_PK_I16_I32)
3962 NODE_NAME_CASE(CVT_PK_U16_U32)
39593963 NODE_NAME_CASE(FP_TO_FP16)
39603964 NODE_NAME_CASE(FP16_ZEXT)
39613965 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
416416 // Convert two float 32 numbers into a single register holding two packed f16
417417 // with round to zero.
418418 CVT_PKRTZ_F16_F32,
419 CVT_PKNORM_I16_F32,
420 CVT_PKNORM_U16_F32,
421 CVT_PK_I16_I32,
422 CVT_PK_U16_U32,
419423
420424 // Same as the standard node, except the high bits of the resulting integer
421425 // are known 0.
3232
3333 def AMDGPUFPPackOp : SDTypeProfile<1, 2,
3434 [SDTCisFP<1>, SDTCisSameAs<1, 2>]
35 >;
36
37 def AMDGPUIntPackOp : SDTypeProfile<1, 2,
38 [SDTCisInt<1>, SDTCisSameAs<1, 2>]
3539 >;
3640
3741 def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
141145 def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
142146
143147 def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
148 def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
149 def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
150 def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
151 def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
144152 def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
145153 def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
146154
204204 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
205205 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
206206 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
207 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
207208 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
208209
209210 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
35163517 }
35173518 case ISD::INTRINSIC_WO_CHAIN: {
35183519 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3519 if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
3520 switch (IID) {
3521 case Intrinsic::amdgcn_cvt_pkrtz: {
35203522 SDValue Src0 = N->getOperand(1);
35213523 SDValue Src1 = N->getOperand(2);
35223524 SDLoc SL(N);
35233525 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
35243526 Src0, Src1);
35253527 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
35263528 return;
3529 }
3530 case Intrinsic::amdgcn_cvt_pknorm_i16:
3531 case Intrinsic::amdgcn_cvt_pknorm_u16:
3532 case Intrinsic::amdgcn_cvt_pk_i16:
3533 case Intrinsic::amdgcn_cvt_pk_u16: {
3534 SDValue Src0 = N->getOperand(1);
3535 SDValue Src1 = N->getOperand(2);
3536 SDLoc SL(N);
3537 unsigned Opcode;
3538
3539 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3540 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3541 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3542 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3543 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3544 Opcode = AMDGPUISD::CVT_PK_I16_I32;
3545 else
3546 Opcode = AMDGPUISD::CVT_PK_U16_U32;
3547
3548 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3549 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3550 return;
3551 }
35273552 }
35283553 break;
35293554 }
44234448 case Intrinsic::amdgcn_ubfe:
44244449 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
44254450 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4426 case Intrinsic::amdgcn_cvt_pkrtz: {
4427 // FIXME: Stop adding cast if v2f16 legal.
4451 case Intrinsic::amdgcn_cvt_pkrtz:
4452 case Intrinsic::amdgcn_cvt_pknorm_i16:
4453 case Intrinsic::amdgcn_cvt_pknorm_u16:
4454 case Intrinsic::amdgcn_cvt_pk_i16:
4455 case Intrinsic::amdgcn_cvt_pk_u16: {
4456 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
44284457 EVT VT = Op.getValueType();
4429 SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
4458 unsigned Opcode;
4459
4460 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
4461 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
4462 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
4463 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
4464 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
4465 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
4466 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
4467 Opcode = AMDGPUISD::CVT_PK_I16_I32;
4468 else
4469 Opcode = AMDGPUISD::CVT_PK_U16_U32;
4470
4471 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
44304472 Op.getOperand(1), Op.getOperand(2));
44314473 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
44324474 }
406406 defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT, int_amdgcn_mbcnt_hi>;
407407 defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT, AMDGPUldexp>;
408408 defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT>; // TODO: set "Uses = dst"
409 defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT>;
410 defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT>;
409 defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT, AMDGPUpknorm_i16_f32>;
410 defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT, AMDGPUpknorm_u16_f32>;
411411 defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT, AMDGPUpkrtz_f16_f32>;
412 defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT>;
413 defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT>;
412 defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT, AMDGPUpk_u16_u32>;
413 defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT, AMDGPUpk_i16_i32>;
414414
415415 } // End SubtargetPredicate = isGCN
416416
32633263
32643264 break;
32653265 }
3266 case Intrinsic::amdgcn_cvt_pknorm_i16:
3267 case Intrinsic::amdgcn_cvt_pknorm_u16:
3268 case Intrinsic::amdgcn_cvt_pk_i16:
3269 case Intrinsic::amdgcn_cvt_pk_u16: {
3270 Value *Src0 = II->getArgOperand(0);
3271 Value *Src1 = II->getArgOperand(1);
3272
3273 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
3274 return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
3275
3276 break;
3277 }
32663278 case Intrinsic::amdgcn_ubfe:
32673279 case Intrinsic::amdgcn_sbfe: {
32683280 // Decompose simple cases into standard shifts.
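The InstCombine hunk above folds a call to any of the new packing intrinsics whose operands are both undef into an undef result. A minimal IR sketch of that transform, mirroring the amdgcn-intrinsics.ll tests added at the end of this change (the function name @fold_both_undef is illustrative only):

declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32)

; Before instcombine: the call has only undef operands.
define <2 x i16> @fold_both_undef() {
  %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef)
  ret <2 x i16> %cvt
}
; After instcombine the call is removed and the function simply returns undef:
;   ret <2 x i16> undef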
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
4
5 ; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
6 ; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
7 ; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
8 ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
9 ; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]]
10 ; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]]
11 define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
12 %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
13 %r = bitcast <2 x i16> %result to i32
14 store i32 %r, i32 addrspace(1)* %out
15 ret void
16 }
17
18 ; GCN-LABEL: {{^}}s_cvt_pk_i16_samereg_i32:
19 ; GCN: s_load_dword [[X:s[0-9]+]]
20 ; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
21 define amdgpu_kernel void @s_cvt_pk_i16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
22 %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %x)
23 %r = bitcast <2 x i16> %result to i32
24 store i32 %r, i32 addrspace(1)* %out
25 ret void
26 }
27
28 ; GCN-LABEL: {{^}}v_cvt_pk_i16_i32:
29 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
30 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
31 ; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[A]], [[B]]
32 ; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[A]], [[B]]
33 define amdgpu_kernel void @v_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
34 %tid = call i32 @llvm.amdgcn.workitem.id.x()
35 %tid.ext = sext i32 %tid to i64
36 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
37 %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
38 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
39 %a = load volatile i32, i32 addrspace(1)* %a.gep
40 %b = load volatile i32, i32 addrspace(1)* %b.gep
41 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 %b)
42 %r = bitcast <2 x i16> %cvt to i32
43 store i32 %r, i32 addrspace(1)* %out.gep
44 ret void
45 }
46
47 ; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_reg_imm:
48 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
49 ; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
50 define amdgpu_kernel void @v_cvt_pk_i16_i32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
51 %tid = call i32 @llvm.amdgcn.workitem.id.x()
52 %tid.ext = sext i32 %tid to i64
53 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
54 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
55 %a = load volatile i32, i32 addrspace(1)* %a.gep
56 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 1)
57 %r = bitcast <2 x i16> %cvt to i32
58 store i32 %r, i32 addrspace(1)* %out.gep
59 ret void
60 }
61
62 ; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_imm_reg:
63 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
64 ; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, 1, [[A]]
65 ; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, 1, [[A]]
66 define amdgpu_kernel void @v_cvt_pk_i16_i32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
67 %tid = call i32 @llvm.amdgcn.workitem.id.x()
68 %tid.ext = sext i32 %tid to i64
69 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
70 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
71 %a = load volatile i32, i32 addrspace(1)* %a.gep
72 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 1, i32 %a)
73 %r = bitcast <2 x i16> %cvt to i32
74 store i32 %r, i32 addrspace(1)* %out.gep
75 ret void
76 }
77
78 declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) #1
79 declare i32 @llvm.amdgcn.workitem.id.x() #1
80
81
82 attributes #0 = { nounwind }
83 attributes #1 = { nounwind readnone }
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
4
5 ; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
6 ; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
7 ; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
8 ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
9 ; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]]
10 ; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]]
11 define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
12 %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
13 %r = bitcast <2 x i16> %result to i32
14 store i32 %r, i32 addrspace(1)* %out
15 ret void
16 }
17
18 ; GCN-LABEL: {{^}}s_cvt_pk_u16_samereg_i32:
19 ; GCN: s_load_dword [[X:s[0-9]+]]
20 ; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
21 define amdgpu_kernel void @s_cvt_pk_u16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
22 %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %x)
23 %r = bitcast <2 x i16> %result to i32
24 store i32 %r, i32 addrspace(1)* %out
25 ret void
26 }
27
28 ; GCN-LABEL: {{^}}v_cvt_pk_u16_u32:
29 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
30 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
31 ; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
32 ; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[A]], [[B]]
33 define amdgpu_kernel void @v_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
34 %tid = call i32 @llvm.amdgcn.workitem.id.x()
35 %tid.ext = sext i32 %tid to i64
36 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
37 %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
38 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
39 %a = load volatile i32, i32 addrspace(1)* %a.gep
40 %b = load volatile i32, i32 addrspace(1)* %b.gep
41 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 %b)
42 %r = bitcast <2 x i16> %cvt to i32
43 store i32 %r, i32 addrspace(1)* %out.gep
44 ret void
45 }
46
47 ; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_reg_imm:
48 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
49 ; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
50 define amdgpu_kernel void @v_cvt_pk_u16_u32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
51 %tid = call i32 @llvm.amdgcn.workitem.id.x()
52 %tid.ext = sext i32 %tid to i64
53 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
54 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
55 %a = load volatile i32, i32 addrspace(1)* %a.gep
56 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 1)
57 %r = bitcast <2 x i16> %cvt to i32
58 store i32 %r, i32 addrspace(1)* %out.gep
59 ret void
60 }
61
62 ; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_imm_reg:
63 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
64 ; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, 1, [[A]]
65 ; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, 1, [[A]]
66 define amdgpu_kernel void @v_cvt_pk_u16_u32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
67 %tid = call i32 @llvm.amdgcn.workitem.id.x()
68 %tid.ext = sext i32 %tid to i64
69 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
70 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
71 %a = load volatile i32, i32 addrspace(1)* %a.gep
72 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 1, i32 %a)
73 %r = bitcast <2 x i16> %cvt to i32
74 store i32 %r, i32 addrspace(1)* %out.gep
75 ret void
76 }
77
78 declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) #1
79 declare i32 @llvm.amdgcn.workitem.id.x() #1
80
81
82 attributes #0 = { nounwind }
83 attributes #1 = { nounwind readnone }
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
4
5 ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
6 ; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
7 ; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
8 ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
9 ; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
10 ; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]]
11 define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
12 %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
13 %r = bitcast <2 x i16> %result to i32
14 store i32 %r, i32 addrspace(1)* %out
15 ret void
16 }
17
18 ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_samereg_f32:
19 ; GCN: s_load_dword [[X:s[0-9]+]]
20 ; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
21 define amdgpu_kernel void @s_cvt_pknorm_i16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
22 %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %x)
23 %r = bitcast <2 x i16> %result to i32
24 store i32 %r, i32 addrspace(1)* %out
25 ret void
26 }
27
28 ; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32:
29 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
30 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
31 ; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
32 ; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[A]], [[B]]
33 define amdgpu_kernel void @v_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
34 %tid = call i32 @llvm.amdgcn.workitem.id.x()
35 %tid.ext = sext i32 %tid to i64
36 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
37 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
38 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
39 %a = load volatile float, float addrspace(1)* %a.gep
40 %b = load volatile float, float addrspace(1)* %b.gep
41 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %b)
42 %r = bitcast <2 x i16> %cvt to i32
43 store i32 %r, i32 addrspace(1)* %out.gep
44 ret void
45 }
46
47 ; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_reg_imm:
48 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
49 ; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
50 define amdgpu_kernel void @v_cvt_pknorm_i16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
51 %tid = call i32 @llvm.amdgcn.workitem.id.x()
52 %tid.ext = sext i32 %tid to i64
53 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
54 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
55 %a = load volatile float, float addrspace(1)* %a.gep
56 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float 1.0)
57 %r = bitcast <2 x i16> %cvt to i32
58 store i32 %r, i32 addrspace(1)* %out.gep
59 ret void
60 }
61
62 ; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_imm_reg:
63 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
64 ; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
65 ; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, 1.0, [[A]]
66 define amdgpu_kernel void @v_cvt_pknorm_i16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
67 %tid = call i32 @llvm.amdgcn.workitem.id.x()
68 %tid.ext = sext i32 %tid to i64
69 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
70 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
71 %a = load volatile float, float addrspace(1)* %a.gep
72 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float 1.0, float %a)
73 %r = bitcast <2 x i16> %cvt to i32
74 store i32 %r, i32 addrspace(1)* %out.gep
75 ret void
76 }
77
78 ; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo:
79 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
80 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
81 ; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
82 define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
83 %tid = call i32 @llvm.amdgcn.workitem.id.x()
84 %tid.ext = sext i32 %tid to i64
85 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
86 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
87 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
88 %a = load volatile float, float addrspace(1)* %a.gep
89 %b = load volatile float, float addrspace(1)* %b.gep
90 %neg.a = fsub float -0.0, %a
91 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %b)
92 %r = bitcast <2 x i16> %cvt to i32
93 store i32 %r, i32 addrspace(1)* %out.gep
94 ret void
95 }
96
97 ; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_hi:
98 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
99 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
100 ; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
101 define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
102 %tid = call i32 @llvm.amdgcn.workitem.id.x()
103 %tid.ext = sext i32 %tid to i64
104 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
105 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
106 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
107 %a = load volatile float, float addrspace(1)* %a.gep
108 %b = load volatile float, float addrspace(1)* %b.gep
109 %neg.b = fsub float -0.0, %b
110 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %neg.b)
111 %r = bitcast <2 x i16> %cvt to i32
112 store i32 %r, i32 addrspace(1)* %out.gep
113 ret void
114 }
115
116 ; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo_hi:
117 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
118 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
119 ; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
120 define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
121 %tid = call i32 @llvm.amdgcn.workitem.id.x()
122 %tid.ext = sext i32 %tid to i64
123 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
124 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
125 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
126 %a = load volatile float, float addrspace(1)* %a.gep
127 %b = load volatile float, float addrspace(1)* %b.gep
128 %neg.a = fsub float -0.0, %a
129 %neg.b = fsub float -0.0, %b
130 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %neg.b)
131 %r = bitcast <2 x i16> %cvt to i32
132 store i32 %r, i32 addrspace(1)* %out.gep
133 ret void
134 }
135
136 ; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi:
137 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
138 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
139 ; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
140 define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
141 %tid = call i32 @llvm.amdgcn.workitem.id.x()
142 %tid.ext = sext i32 %tid to i64
143 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
144 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
145 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
146 %a = load volatile float, float addrspace(1)* %a.gep
147 %b = load volatile float, float addrspace(1)* %b.gep
148 %fabs.a = call float @llvm.fabs.f32(float %a)
149 %neg.fabs.a = fsub float -0.0, %fabs.a
150 %neg.b = fsub float -0.0, %b
151 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.fabs.a, float %neg.b)
152 %r = bitcast <2 x i16> %cvt to i32
153 store i32 %r, i32 addrspace(1)* %out.gep
154 ret void
155 }
156
157 declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) #1
158 declare float @llvm.fabs.f32(float) #1
159 declare i32 @llvm.amdgcn.workitem.id.x() #1
160
161
162 attributes #0 = { nounwind }
163 attributes #1 = { nounwind readnone }
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
4
5 ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
6 ; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
7 ; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
8 ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
9 ; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
10 ; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]]
11 define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
12 %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
13 %r = bitcast <2 x i16> %result to i32
14 store i32 %r, i32 addrspace(1)* %out
15 ret void
16 }
17
18 ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_samereg_f32:
19 ; GCN: s_load_dword [[X:s[0-9]+]]
20 ; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
21 define amdgpu_kernel void @s_cvt_pknorm_u16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
22 %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %x)
23 %r = bitcast <2 x i16> %result to i32
24 store i32 %r, i32 addrspace(1)* %out
25 ret void
26 }
27
28 ; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32:
29 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
30 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
31 ; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
32 ; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[A]], [[B]]
33 define amdgpu_kernel void @v_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
34 %tid = call i32 @llvm.amdgcn.workitem.id.x()
35 %tid.ext = sext i32 %tid to i64
36 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
37 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
38 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
39 %a = load volatile float, float addrspace(1)* %a.gep
40 %b = load volatile float, float addrspace(1)* %b.gep
41 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %b)
42 %r = bitcast <2 x i16> %cvt to i32
43 store i32 %r, i32 addrspace(1)* %out.gep
44 ret void
45 }
46
47 ; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_reg_imm:
48 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
49 ; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
50 define amdgpu_kernel void @v_cvt_pknorm_u16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
51 %tid = call i32 @llvm.amdgcn.workitem.id.x()
52 %tid.ext = sext i32 %tid to i64
53 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
54 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
55 %a = load volatile float, float addrspace(1)* %a.gep
56 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float 1.0)
57 %r = bitcast <2 x i16> %cvt to i32
58 store i32 %r, i32 addrspace(1)* %out.gep
59 ret void
60 }
61
62 ; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_imm_reg:
63 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
64 ; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
65 ; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, 1.0, [[A]]
66 define amdgpu_kernel void @v_cvt_pknorm_u16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
67 %tid = call i32 @llvm.amdgcn.workitem.id.x()
68 %tid.ext = sext i32 %tid to i64
69 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
70 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
71 %a = load volatile float, float addrspace(1)* %a.gep
72 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float 1.0, float %a)
73 %r = bitcast <2 x i16> %cvt to i32
74 store i32 %r, i32 addrspace(1)* %out.gep
75 ret void
76 }
77
78 ; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo:
79 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
80 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
81 ; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
82 define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
83 %tid = call i32 @llvm.amdgcn.workitem.id.x()
84 %tid.ext = sext i32 %tid to i64
85 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
86 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
87 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
88 %a = load volatile float, float addrspace(1)* %a.gep
89 %b = load volatile float, float addrspace(1)* %b.gep
90 %neg.a = fsub float -0.0, %a
91 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %b)
92 %r = bitcast <2 x i16> %cvt to i32
93 store i32 %r, i32 addrspace(1)* %out.gep
94 ret void
95 }
96
97 ; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_hi:
98 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
99 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
100 ; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
101 define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
102 %tid = call i32 @llvm.amdgcn.workitem.id.x()
103 %tid.ext = sext i32 %tid to i64
104 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
105 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
106 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
107 %a = load volatile float, float addrspace(1)* %a.gep
108 %b = load volatile float, float addrspace(1)* %b.gep
109 %neg.b = fsub float -0.0, %b
110 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %neg.b)
111 %r = bitcast <2 x i16> %cvt to i32
112 store i32 %r, i32 addrspace(1)* %out.gep
113 ret void
114 }
115
116 ; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo_hi:
117 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
118 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
119 ; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
120 define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
121 %tid = call i32 @llvm.amdgcn.workitem.id.x()
122 %tid.ext = sext i32 %tid to i64
123 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
124 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
125 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
126 %a = load volatile float, float addrspace(1)* %a.gep
127 %b = load volatile float, float addrspace(1)* %b.gep
128 %neg.a = fsub float -0.0, %a
129 %neg.b = fsub float -0.0, %b
130 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %neg.b)
131 %r = bitcast <2 x i16> %cvt to i32
132 store i32 %r, i32 addrspace(1)* %out.gep
133 ret void
134 }
135
136 ; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi:
137 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
138 ; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
139 ; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
140 define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
141 %tid = call i32 @llvm.amdgcn.workitem.id.x()
142 %tid.ext = sext i32 %tid to i64
143 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
144 %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
145 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
146 %a = load volatile float, float addrspace(1)* %a.gep
147 %b = load volatile float, float addrspace(1)* %b.gep
148 %fabs.a = call float @llvm.fabs.f32(float %a)
149 %neg.fabs.a = fsub float -0.0, %fabs.a
150 %neg.b = fsub float -0.0, %b
151 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.fabs.a, float %neg.b)
152 %r = bitcast <2 x i16> %cvt to i32
153 store i32 %r, i32 addrspace(1)* %out.gep
154 ret void
155 }
156
157 declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) #1
158 declare float @llvm.fabs.f32(float) #1
159 declare i32 @llvm.amdgcn.workitem.id.x() #1
160
161
162 attributes #0 = { nounwind }
163 attributes #1 = { nounwind readnone }
719719 define <2 x half> @constant_rtz_pkrtz() {
720720 %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 65535.0, float 65535.0)
721721 ret <2 x half> %cvt
722 }
723
724 ; --------------------------------------------------------------------
725 ; llvm.amdgcn.cvt.pknorm.i16
726 ; --------------------------------------------------------------------
727
728 declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone
729
730 ; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16(
731 ; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
732 define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) {
733 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
734 ret <2 x i16> %cvt
735 }
736
737 ; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16(
738 ; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
739 define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) {
740 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
741 ret <2 x i16> %cvt
742 }
743
744 ; CHECK-LABEL: @undef_cvt_pknorm_i16(
745 ; CHECK: ret <2 x i16> undef
746 define <2 x i16> @undef_cvt_pknorm_i16() {
747 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef)
748 ret <2 x i16> %cvt
749 }
750
751 ; --------------------------------------------------------------------
752 ; llvm.amdgcn.cvt.pknorm.u16
753 ; --------------------------------------------------------------------
754
755 declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone
756
757 ; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16(
758 ; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
759 define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) {
760 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
761 ret <2 x i16> %cvt
762 }
763
764 ; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16(
765 ; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
766 define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) {
767 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
768 ret <2 x i16> %cvt
769 }
770
771 ; CHECK-LABEL: @undef_cvt_pknorm_u16(
772 ; CHECK: ret <2 x i16> undef
773 define <2 x i16> @undef_cvt_pknorm_u16() {
774 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef)
775 ret <2 x i16> %cvt
776 }
777
778 ; --------------------------------------------------------------------
779 ; llvm.amdgcn.cvt.pk.i16
780 ; --------------------------------------------------------------------
781
782 declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone
783
784 ; CHECK-LABEL: @undef_lhs_cvt_pk_i16(
785 ; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
786 define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) {
787 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
788 ret <2 x i16> %cvt
789 }
790
791 ; CHECK-LABEL: @undef_rhs_cvt_pk_i16(
792 ; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
793 define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) {
794 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
795 ret <2 x i16> %cvt
796 }
797
798 ; CHECK-LABEL: @undef_cvt_pk_i16(
799 ; CHECK: ret <2 x i16> undef
800 define <2 x i16> @undef_cvt_pk_i16() {
801 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef)
802 ret <2 x i16> %cvt
803 }
804
805 ; --------------------------------------------------------------------
806 ; llvm.amdgcn.cvt.pk.u16
807 ; --------------------------------------------------------------------
808
809 declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone
810
811 ; CHECK-LABEL: @undef_lhs_cvt_pk_u16(
812 ; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
813 define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) {
814 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
815 ret <2 x i16> %cvt
816 }
817
818 ; CHECK-LABEL: @undef_rhs_cvt_pk_u16(
819 ; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
820 define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) {
821 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
822 ret <2 x i16> %cvt
823 }
824
825 ; CHECK-LABEL: @undef_cvt_pk_u16(
826 ; CHECK: ret <2 x i16> undef
827 define <2 x i16> @undef_cvt_pk_u16() {
828 %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef)
829 ret <2 x i16> %cvt
722830 }
723831
724832 ; --------------------------------------------------------------------