llvm.org GIT mirror: llvm / 4525054

AMDGPU: Make v2i16/v2f16 legal on VI

This usually results in better code. Fixes using inline asm with short2, and also fixes having a different ABI for function parameters between VI and gfx9.

Partially cleans up the mess used for lowering of the d16 operations. Making v4f16 legal will help clean this up more, but this requires additional work.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@332953 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault, 1 year, 9 months ago
37 changed file(s) with 952 addition(s) and 824 deletion(s).
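To make the ABI point concrete, here is a minimal kernel-argument sketch (not part of the commit; the function name is assumed, mirroring the updated load_v2f16_arg test below). With v2i16 legal on VI, a <2 x i16> argument can be loaded with a single s_load_dword and stored as one dword, matching the lowering gfx9 already used, instead of being split into per-element ushort loads and re-packed.

; Illustrative sketch only; assumes the same llc invocations as the
; updated tests (-march=amdgcn -mcpu=tonga and -mcpu=gfx900).
define amdgpu_kernel void @store_v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %arg) {
  store <2 x i16> %arg, <2 x i16> addrspace(1)* %out
  ret void
}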
798798 def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
799799 AssemblerPredicate<"FeatureVOP3P">;
800800
801 def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">,
802 AssemblerPredicate<"!FeatureVOP3P">;
803
801804 def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
802805 AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
803806
30063006 SDValue X = LHS->getOperand(0);
30073007
30083008 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3009 isTypeLegal(MVT::v2i16)) {
3009 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
30103010 // Prefer build_vector as the canonical form if packed types are legal.
30113011 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
30123012 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
38173817 // TODO: Generalize and move to DAGCombiner
38183818 SDValue Src = N->getOperand(0);
38193819 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3820 assert(Src.getValueType() == MVT::i64);
3821 SDLoc SL(N);
3822 uint64_t CVal = C->getZExtValue();
3823 return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
3824 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3825 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3820 if (Src.getValueType() == MVT::i64) {
3821 SDLoc SL(N);
3822 uint64_t CVal = C->getZExtValue();
3823 return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
3824 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3825 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3826 }
38263827 }
38273828
38283829 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
10591059 defm : MUBUF_LoadIntrinsicPat;
10601060
10611061 let SubtargetPredicate = HasUnpackedD16VMem in {
1062 defm : MUBUF_LoadIntrinsicPat, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
1062 defm : MUBUF_LoadIntrinsicPat_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
10631063 defm : MUBUF_LoadIntrinsicPat;
10641064 defm : MUBUF_LoadIntrinsicPat;
10651065 } // End HasUnpackedD16VMem.
10661066
10671067 let SubtargetPredicate = HasPackedD16VMem in {
1068 defm : MUBUF_LoadIntrinsicPat;
1069 defm : MUBUF_LoadIntrinsicPat, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
1068 defm : MUBUF_LoadIntrinsicPat_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
1069 defm : MUBUF_LoadIntrinsicPat;
10701070 defm : MUBUF_LoadIntrinsicPat;
10711071 defm : MUBUF_LoadIntrinsicPat;
10721072 } // End HasPackedD16VMem.
15461546 defm : MTBUF_LoadIntrinsicPat;
15471547
15481548 let SubtargetPredicate = HasUnpackedD16VMem in {
1549 defm : MTBUF_LoadIntrinsicPat, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
1549 defm : MTBUF_LoadIntrinsicPat_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
15501550 defm : MTBUF_LoadIntrinsicPat;
15511551 defm : MTBUF_LoadIntrinsicPat;
15521552 } // End HasUnpackedD16VMem.
15531553
15541554 let SubtargetPredicate = HasPackedD16VMem in {
1555 defm : MTBUF_LoadIntrinsicPat;
1556 defm : MTBUF_LoadIntrinsicPat, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
1555 defm : MTBUF_LoadIntrinsicPat_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
1556 defm : MTBUF_LoadIntrinsicPat;
15571557 defm : MTBUF_LoadIntrinsicPat;
15581558 defm : MTBUF_LoadIntrinsicPat;
15591559 } // End HasPackedD16VMem.
609609
610610 let SubtargetPredicate = HasPackedD16VMem in {
611611 def _packed_v1 : ImageDimPattern;
612 // used on gfx810
613 def _packed_v2 : ImageDimPattern;
614 // used on gfx900
615 def _packed_v2_gfx9 : ImageDimPattern;
612 def _packed_v2 : ImageDimPattern;
616613 def _packed_v4 : ImageDimPattern;
617614 } // End HasPackedD16VMem.
618615 }
716713 } // End HasUnpackedD16VMem.
717714
718715 let SubtargetPredicate = HasPackedD16VMem in {
719 defm : ImageSampleDataPatterns(opcode # _V1), i32, "_D16">;
716 defm : ImageSampleDataPatterns(opcode # _V1), f16, "_D16">;
720717 defm : ImageSampleDataPatterns(opcode # _V2), v2i32, "_D16">;
721718 } // End HasPackedD16VMem.
722719 }
779776 } // End HasUnPackedD16VMem.
780777
781778 let SubtargetPredicate = HasPackedD16VMem in {
782 defm : ImageLoadDataPatterns(opcode # _V1), i32, "_D16">;
779 defm : ImageLoadDataPatterns(opcode # _V1), f16, "_D16">;
783780 defm : ImageLoadDataPatterns(opcode # _V2), v2i32, "_D16">;
784781 } // End HasPackedD16VMem.
785782 }
864861 defm : ImageLoadAltPatterns;
865862
866863 // Image store.
867 defm : ImageStorePatterns;
868 defm : ImageStorePatterns<SIImage_store_mip, "IMAGE_STORE_MIP">;
864 defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
865 defm : ImageStorePatterns;
869866 defm : ImageStoreAltPatterns;
870867 defm : ImageStoreAltPatterns;
871868
138138 if (Subtarget->has16BitInsts()) {
139139 addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
140140 addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
141 }
142
143 if (Subtarget->hasVOP3PInsts()) {
141
142 // Unless there are also VOP3P operations, no operations are really legal.
144143 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
145144 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
146145 }
173172
174173 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
175174 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
176 setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
177175
178176 setOperationAction(ISD::SELECT, MVT::i1, Promote);
179177 setOperationAction(ISD::SELECT, MVT::i64, Custom);
422420 setOperationAction(ISD::FMA, MVT::f16, Legal);
423421 if (!Subtarget->hasFP16Denormals())
424422 setOperationAction(ISD::FMAD, MVT::f16, Legal);
425 }
426
427 if (Subtarget->hasVOP3PInsts()) {
423
428424 for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
429425 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
430426 switch (Op) {
471467 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
472468 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
473469 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
474 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
475 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
476 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
477 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
478
470
471 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
472 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
473 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
474 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
475
476 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
477 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
478 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
479 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
480
481 setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
482 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
483 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
484 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
485
486 if (!Subtarget->hasVOP3PInsts()) {
487 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
488 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
489 }
490
491 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
492 // This isn't really legal, but this avoids the legalizer unrolling it (and
493 // allows matching fneg (fabs x) patterns)
494 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
495 }
496
497 if (Subtarget->hasVOP3PInsts()) {
479498 setOperationAction(ISD::ADD, MVT::v2i16, Legal);
480499 setOperationAction(ISD::SUB, MVT::v2i16, Legal);
481500 setOperationAction(ISD::MUL, MVT::v2i16, Legal);
488507 setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
489508
490509 setOperationAction(ISD::FADD, MVT::v2f16, Legal);
491 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
492510 setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
493511 setOperationAction(ISD::FMA, MVT::v2f16, Legal);
494512 setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
495513 setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
496514 setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
497515
498 // This isn't really legal, but this avoids the legalizer unrolling it (and
499 // allows matching fneg (fabs x) patterns)
500 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
501
502516 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
503517 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
504
505 setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
506 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
507 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
508 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
518 }
519
520 if (Subtarget->has16BitInsts()) {
521 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
522 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
523 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
524 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
509525 } else {
526 // Legalization hack.
510527 setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
511528 setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
512529 }
35133530 return 0;
35143531 }
35153532
3516 static SDValue adjustLoadValueType(SDValue Result, EVT LoadVT, SDLoc DL,
3517 SelectionDAG &DAG, bool Unpacked) {
3533 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3534 const SDLoc &DL,
3535 SelectionDAG &DAG, bool Unpacked) {
3536 if (!LoadVT.isVector())
3537 return Result;
3538
35183539 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
35193540 // Truncate to v2i16/v4i16.
35203541 EVT IntLoadVT = LoadVT.changeTypeToInteger();
3521 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntLoadVT, Result);
3542
3543 // Workaround legalizer not scalarizing truncate after vector op
3544 // legalization by not creating intermediate vector trunc.
3545 SmallVector Elts;
3546 DAG.ExtractVectorElements(Result, Elts);
3547 for (SDValue &Elt : Elts)
3548 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3549
3550 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3551
35223552 // Bitcast to original type (v2f16/v4f16).
3523 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
3524 }
3553 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3554 }
3555
35253556 // Cast back to the original packed type.
35263557 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
35273558 }
35283559
3529 // This is to lower INTRINSIC_W_CHAIN with illegal result types.
3530 SDValue SITargetLowering::lowerIntrinsicWChain_IllegalReturnType(SDValue Op,
3531 SDValue &Chain, SelectionDAG &DAG) const {
3532 EVT LoadVT = Op.getValueType();
3533 // TODO: handle v3f16.
3534 if (LoadVT != MVT::v2f16 && LoadVT != MVT::v4f16)
3535 return SDValue();
3560 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3561 MemSDNode *M,
3562 SelectionDAG &DAG,
3563 bool IsIntrinsic) const {
3564 SDLoc DL(M);
3565 SmallVector Ops;
3566 Ops.reserve(M->getNumOperands());
3567
3568 Ops.push_back(M->getOperand(0));
3569 if (IsIntrinsic)
3570 Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
3571
3572 // Skip 1, as it is the intrinsic ID.
3573 for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
3574 Ops.push_back(M->getOperand(I));
35363575
35373576 bool Unpacked = Subtarget->hasUnpackedD16VMem();
3538 EVT UnpackedLoadVT = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
3539 EVT EquivLoadVT = Unpacked ? UnpackedLoadVT :
3540 getEquivalentMemType(*DAG.getContext(), LoadVT);
3577 EVT LoadVT = M->getValueType(0);
3578
3579 EVT UnpackedLoadVT = LoadVT.isVector() ?
3580 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3581 LoadVT.getVectorNumElements()) : LoadVT;
3582 EVT EquivLoadVT = LoadVT;
3583 if (LoadVT.isVector()) {
3584 EquivLoadVT = Unpacked ? UnpackedLoadVT :
3585 getEquivalentMemType(*DAG.getContext(), LoadVT);
3586 }
3587
35413588 // Change from v4f16/v2f16 to EquivLoadVT.
35423589 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
35433590
3544 SDValue Res;
3545 SDLoc DL(Op);
3546 MemSDNode *M = cast<MemSDNode>(Op);
3547 unsigned IID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
3548 switch (IID) {
3549 case Intrinsic::amdgcn_tbuffer_load: {
3550 SDValue Ops[] = {
3551 Op.getOperand(0), // Chain
3552 Op.getOperand(2), // rsrc
3553 Op.getOperand(3), // vindex
3554 Op.getOperand(4), // voffset
3555 Op.getOperand(5), // soffset
3556 Op.getOperand(6), // offset
3557 Op.getOperand(7), // dfmt
3558 Op.getOperand(8), // nfmt
3559 Op.getOperand(9), // glc
3560 Op.getOperand(10) // slc
3561 };
3562 Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL,
3563 VTList, Ops, M->getMemoryVT(),
3564 M->getMemOperand());
3565 Chain = Res.getValue(1);
3566 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
3567 }
3568 case Intrinsic::amdgcn_buffer_load_format: {
3569 SDValue Ops[] = {
3570 Op.getOperand(0), // Chain
3571 Op.getOperand(2), // rsrc
3572 Op.getOperand(3), // vindex
3573 Op.getOperand(4), // offset
3574 Op.getOperand(5), // glc
3575 Op.getOperand(6) // slc
3576 };
3577 Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
3578 DL, VTList, Ops, M->getMemoryVT(),
3579 M->getMemOperand());
3580 Chain = Res.getValue(1);
3581 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
3582 }
3583 case Intrinsic::amdgcn_image_load:
3584 case Intrinsic::amdgcn_image_load_mip: {
3585 SDValue Ops[] = {
3586 Op.getOperand(0), // Chain
3587 Op.getOperand(2), // vaddr
3588 Op.getOperand(3), // rsrc
3589 Op.getOperand(4), // dmask
3590 Op.getOperand(5), // glc
3591 Op.getOperand(6), // slc
3592 Op.getOperand(7), // lwe
3593 Op.getOperand(8) // da
3594 };
3595 unsigned Opc = getImageOpcode(IID);
3596 Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
3597 M->getMemOperand());
3598 Chain = Res.getValue(1);
3599 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
3600 }
3601 // Basic sample.
3602 case Intrinsic::amdgcn_image_sample:
3603 case Intrinsic::amdgcn_image_sample_cl:
3604 case Intrinsic::amdgcn_image_sample_d:
3605 case Intrinsic::amdgcn_image_sample_d_cl:
3606 case Intrinsic::amdgcn_image_sample_l:
3607 case Intrinsic::amdgcn_image_sample_b:
3608 case Intrinsic::amdgcn_image_sample_b_cl:
3609 case Intrinsic::amdgcn_image_sample_lz:
3610 case Intrinsic::amdgcn_image_sample_cd:
3611 case Intrinsic::amdgcn_image_sample_cd_cl:
3612
3613 // Sample with comparison.
3614 case Intrinsic::amdgcn_image_sample_c:
3615 case Intrinsic::amdgcn_image_sample_c_cl:
3616 case Intrinsic::amdgcn_image_sample_c_d:
3617 case Intrinsic::amdgcn_image_sample_c_d_cl:
3618 case Intrinsic::amdgcn_image_sample_c_l:
3619 case Intrinsic::amdgcn_image_sample_c_b:
3620 case Intrinsic::amdgcn_image_sample_c_b_cl:
3621 case Intrinsic::amdgcn_image_sample_c_lz:
3622 case Intrinsic::amdgcn_image_sample_c_cd:
3623 case Intrinsic::amdgcn_image_sample_c_cd_cl:
3624
3625 // Sample with offsets.
3626 case Intrinsic::amdgcn_image_sample_o:
3627 case Intrinsic::amdgcn_image_sample_cl_o:
3628 case Intrinsic::amdgcn_image_sample_d_o:
3629 case Intrinsic::amdgcn_image_sample_d_cl_o:
3630 case Intrinsic::amdgcn_image_sample_l_o:
3631 case Intrinsic::amdgcn_image_sample_b_o:
3632 case Intrinsic::amdgcn_image_sample_b_cl_o:
3633 case Intrinsic::amdgcn_image_sample_lz_o:
3634 case Intrinsic::amdgcn_image_sample_cd_o:
3635 case Intrinsic::amdgcn_image_sample_cd_cl_o:
3636
3637 // Sample with comparison and offsets.
3638 case Intrinsic::amdgcn_image_sample_c_o:
3639 case Intrinsic::amdgcn_image_sample_c_cl_o:
3640 case Intrinsic::amdgcn_image_sample_c_d_o:
3641 case Intrinsic::amdgcn_image_sample_c_d_cl_o:
3642 case Intrinsic::amdgcn_image_sample_c_l_o:
3643 case Intrinsic::amdgcn_image_sample_c_b_o:
3644 case Intrinsic::amdgcn_image_sample_c_b_cl_o:
3645 case Intrinsic::amdgcn_image_sample_c_lz_o:
3646 case Intrinsic::amdgcn_image_sample_c_cd_o:
3647 case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
3648
3649 // Basic gather4
3650 case Intrinsic::amdgcn_image_gather4:
3651 case Intrinsic::amdgcn_image_gather4_cl:
3652 case Intrinsic::amdgcn_image_gather4_l:
3653 case Intrinsic::amdgcn_image_gather4_b:
3654 case Intrinsic::amdgcn_image_gather4_b_cl:
3655 case Intrinsic::amdgcn_image_gather4_lz:
3656
3657 // Gather4 with comparison
3658 case Intrinsic::amdgcn_image_gather4_c:
3659 case Intrinsic::amdgcn_image_gather4_c_cl:
3660 case Intrinsic::amdgcn_image_gather4_c_l:
3661 case Intrinsic::amdgcn_image_gather4_c_b:
3662 case Intrinsic::amdgcn_image_gather4_c_b_cl:
3663 case Intrinsic::amdgcn_image_gather4_c_lz:
3664
3665 // Gather4 with offsets
3666 case Intrinsic::amdgcn_image_gather4_o:
3667 case Intrinsic::amdgcn_image_gather4_cl_o:
3668 case Intrinsic::amdgcn_image_gather4_l_o:
3669 case Intrinsic::amdgcn_image_gather4_b_o:
3670 case Intrinsic::amdgcn_image_gather4_b_cl_o:
3671 case Intrinsic::amdgcn_image_gather4_lz_o:
3672
3673 // Gather4 with comparison and offsets
3674 case Intrinsic::amdgcn_image_gather4_c_o:
3675 case Intrinsic::amdgcn_image_gather4_c_cl_o:
3676 case Intrinsic::amdgcn_image_gather4_c_l_o:
3677 case Intrinsic::amdgcn_image_gather4_c_b_o:
3678 case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
3679 case Intrinsic::amdgcn_image_gather4_c_lz_o: {
3680 SDValue Ops[] = {
3681 Op.getOperand(0), // Chain
3682 Op.getOperand(2), // vaddr
3683 Op.getOperand(3), // rsrc
3684 Op.getOperand(4), // sampler
3685 Op.getOperand(5), // dmask
3686 Op.getOperand(6), // unorm
3687 Op.getOperand(7), // glc
3688 Op.getOperand(8), // slc
3689 Op.getOperand(9), // lwe
3690 Op.getOperand(10) // da
3691 };
3692 unsigned Opc = getImageOpcode(IID);
3693 Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
3694 M->getMemOperand());
3695 Chain = Res.getValue(1);
3696 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
3697 }
3698 default: {
3699 const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
3700 AMDGPU::lookupD16ImageDimIntrinsicByIntr(IID);
3701 if (D16ImageDimIntr) {
3702 SmallVector Ops;
3703 for (auto Value : Op.getNode()->op_values())
3704 Ops.push_back(Value);
3705 Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
3706 Res = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTList, Ops,
3707 M->getMemoryVT(), M->getMemOperand());
3708 Chain = Res.getValue(1);
3709 return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
3710 }
3711
3712 return SDValue();
3713 }
3714 }
3591 SDValue Load
3592 = DAG.getMemIntrinsicNode(IsIntrinsic ? ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3593 VTList, Ops, M->getMemoryVT(),
3594 M->getMemOperand());
3595
3596 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3597
3598 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
37153599 }
37163600
37173601 void SITargetLowering::ReplaceNodeResults(SDNode *N,
37663650 break;
37673651 }
37683652 case ISD::INTRINSIC_W_CHAIN: {
3769 SDValue Chain;
3770 if (SDValue Res = lowerIntrinsicWChain_IllegalReturnType(SDValue(N, 0),
3771 Chain, DAG)) {
3653 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
37723654 Results.push_back(Res);
3773 Results.push_back(Chain);
3655 Results.push_back(Res.getValue(1));
37743656 return;
37753657 }
3658
37763659 break;
37773660 }
37783661 case ISD::SELECT: {
42784161 SelectionDAG &DAG) const {
42794162 SDLoc SL(Op);
42804163 EVT VT = Op.getValueType();
4281 assert(VT == MVT::v4i16 || VT == MVT::v4f16);
4282
4283 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4284
4285 // Turn into pair of packed build_vectors.
4286 // TODO: Special case for constants that can be materialized with s_mov_b64.
4287 SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4288 { Op.getOperand(0), Op.getOperand(1) });
4289 SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4290 { Op.getOperand(2), Op.getOperand(3) });
4291
4292 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4293 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4294
4295 SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4296 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4164
4165 assert(VT == MVT::v2f16 || VT == MVT::v2i16);
4166
4167 SDValue Lo = Op.getOperand(0);
4168 SDValue Hi = Op.getOperand(1);
4169
4170 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4171 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
4172
4173 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
4174 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4175
4176 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4177 DAG.getConstant(16, SL, MVT::i32));
4178
4179 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
4180
4181 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
42974182 }
42984183
42994184 bool
48284713 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
48294714 EVT VT = Op.getValueType();
48304715 EVT IntVT = VT.changeTypeToInteger();
4831
48324716 auto *M = cast<MemSDNode>(Op);
4717 EVT LoadVT = Op.getValueType();
4718 bool IsD16 = LoadVT.getScalarType() == MVT::f16;
4719 if (IsD16)
4720 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
4721
48334722 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
48344723 M->getMemOperand());
48354724 }
48364725 case Intrinsic::amdgcn_tbuffer_load: {
48374726 MemSDNode *M = cast<MemSDNode>(Op);
4727 EVT LoadVT = Op.getValueType();
4728 bool IsD16 = LoadVT.getScalarType() == MVT::f16;
4729 if (IsD16) {
4730 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
4731 }
4732
48384733 SDValue Ops[] = {
48394734 Op.getOperand(0), // Chain
48404735 Op.getOperand(2), // rsrc
48484743 Op.getOperand(10) // slc
48494744 };
48504745
4851 EVT VT = Op.getValueType();
4852
48534746 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
4854 Op->getVTList(), Ops, VT, M->getMemOperand());
4747 Op->getVTList(), Ops, LoadVT,
4748 M->getMemOperand());
48554749 }
48564750 case Intrinsic::amdgcn_buffer_atomic_swap:
48574751 case Intrinsic::amdgcn_buffer_atomic_add:
49324826 Op->getVTList(), Ops, VT, M->getMemOperand());
49334827 }
49344828
4829 case Intrinsic::amdgcn_image_load:
4830 case Intrinsic::amdgcn_image_load_mip: {
4831 EVT LoadVT = Op.getValueType();
4832 if ((Subtarget->hasUnpackedD16VMem() && LoadVT == MVT::v2f16) ||
4833 LoadVT == MVT::v4f16) {
4834 MemSDNode *M = cast<MemSDNode>(Op);
4835 return adjustLoadValueType(getImageOpcode(IntrID), M, DAG);
4836 }
4837
4838 return SDValue();
4839 }
4840
49354841 // Basic sample.
49364842 case Intrinsic::amdgcn_image_sample:
49374843 case Intrinsic::amdgcn_image_sample_cl:
49784884 case Intrinsic::amdgcn_image_sample_c_b_cl_o:
49794885 case Intrinsic::amdgcn_image_sample_c_lz_o:
49804886 case Intrinsic::amdgcn_image_sample_c_cd_o:
4981 case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
4887 case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
4888
4889 // Basic gather4
4890 case Intrinsic::amdgcn_image_gather4:
4891 case Intrinsic::amdgcn_image_gather4_cl:
4892 case Intrinsic::amdgcn_image_gather4_l:
4893 case Intrinsic::amdgcn_image_gather4_b:
4894 case Intrinsic::amdgcn_image_gather4_b_cl:
4895 case Intrinsic::amdgcn_image_gather4_lz:
4896
4897 // Gather4 with comparison
4898 case Intrinsic::amdgcn_image_gather4_c:
4899 case Intrinsic::amdgcn_image_gather4_c_cl:
4900 case Intrinsic::amdgcn_image_gather4_c_l:
4901 case Intrinsic::amdgcn_image_gather4_c_b:
4902 case Intrinsic::amdgcn_image_gather4_c_b_cl:
4903 case Intrinsic::amdgcn_image_gather4_c_lz:
4904
4905 // Gather4 with offsets
4906 case Intrinsic::amdgcn_image_gather4_o:
4907 case Intrinsic::amdgcn_image_gather4_cl_o:
4908 case Intrinsic::amdgcn_image_gather4_l_o:
4909 case Intrinsic::amdgcn_image_gather4_b_o:
4910 case Intrinsic::amdgcn_image_gather4_b_cl_o:
4911 case Intrinsic::amdgcn_image_gather4_lz_o:
4912
4913 // Gather4 with comparison and offsets
4914 case Intrinsic::amdgcn_image_gather4_c_o:
4915 case Intrinsic::amdgcn_image_gather4_c_cl_o:
4916 case Intrinsic::amdgcn_image_gather4_c_l_o:
4917 case Intrinsic::amdgcn_image_gather4_c_b_o:
4918 case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
4919 case Intrinsic::amdgcn_image_gather4_c_lz_o: {
49824920 // Replace dmask with everything disabled with undef.
49834921 const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
49844922 if (!DMask || DMask->isNullValue()) {
49864924 return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
49874925 }
49884926
4927 if ((Subtarget->hasUnpackedD16VMem() && Op.getValueType() == MVT::v2f16) ||
4928 Op.getValueType() == MVT::v4f16) {
4929 return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op),
4930 DAG);
4931 }
4932
49894933 return SDValue();
49904934 }
49914935 default:
4936 EVT LoadVT = Op.getValueType();
4937 if (LoadVT.getScalarSizeInBits() != 16)
4938 return SDValue();
4939
4940 const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
4941 AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID);
4942 if (D16ImageDimIntr) {
4943 bool Unpacked = Subtarget->hasUnpackedD16VMem();
4944 MemSDNode *M = cast<MemSDNode>(Op);
4945
4946 if (isTypeLegal(LoadVT) && (!Unpacked || LoadVT == MVT::f16))
4947 return SDValue();
4948
4949 return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr,
4950 M, DAG, true);
4951 }
4952
49924953 return SDValue();
49934954 }
49944955 }
49964957 SDValue SITargetLowering::handleD16VData(SDValue VData,
49974958 SelectionDAG &DAG) const {
49984959 EVT StoreVT = VData.getValueType();
4960
4961 // No change for f16 and legal vector D16 types.
4962 if (!StoreVT.isVector())
4963 return VData;
4964
49994965 SDLoc DL(VData);
5000
5001 if (StoreVT.isVector()) {
5002 assert ((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
5003 if (!Subtarget->hasUnpackedD16VMem()) {
5004 if (!isTypeLegal(StoreVT)) {
5005 // If Target supports packed vmem, we just need to workaround
5006 // the illegal type by casting to an equivalent one.
5007 EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
5008 return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
5009 }
5010 } else { // We need to unpack the packed data to store.
5011 EVT IntStoreVT = StoreVT.changeTypeToInteger();
5012 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
5013 EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
5014 return DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
5015 }
5016 }
5017 // No change for f16 and legal vector D16 types.
5018 return VData;
4966 assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
4967
4968 if (Subtarget->hasUnpackedD16VMem()) {
4969 // We need to unpack the packed data to store.
4970 EVT IntStoreVT = StoreVT.changeTypeToInteger();
4971 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
4972
4973 EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4974 StoreVT.getVectorNumElements());
4975 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
4976 return DAG.UnrollVectorOp(ZExt.getNode());
4977 }
4978
4979 if (isTypeLegal(StoreVT))
4980 return VData;
4981
4982 // If target supports packed vmem, we just need to workaround
4983 // the illegal type by casting to an equivalent one.
4984 EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
4985 return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
50194986 }
50204987
50214988 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
52065173 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
52075174 M->getMemoryVT(), M->getMemOperand());
52085175 }
5209
52105176 case Intrinsic::amdgcn_image_store:
52115177 case Intrinsic::amdgcn_image_store_mip: {
52125178 SDValue VData = Op.getOperand(2);
5213 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5214 if (IsD16)
5179 if ((Subtarget->hasUnpackedD16VMem() &&
5180 VData.getValueType() == MVT::v2f16) ||
5181 VData.getValueType() == MVT::v4f16) {
5182 SDValue Chain = Op.getOperand(0);
5183
52155184 VData = handleD16VData(VData, DAG);
5216 SDValue Ops[] = {
5217 Chain, // Chain
5218 VData, // vdata
5219 Op.getOperand(3), // vaddr
5220 Op.getOperand(4), // rsrc
5221 Op.getOperand(5), // dmask
5222 Op.getOperand(6), // glc
5223 Op.getOperand(7), // slc
5224 Op.getOperand(8), // lwe
5225 Op.getOperand(9) // da
5226 };
5227 unsigned Opc = (IntrinsicID==Intrinsic::amdgcn_image_store) ?
5228 AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
5229 MemSDNode *M = cast<MemSDNode>(Op);
5230 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5231 M->getMemoryVT(), M->getMemOperand());
5232 }
5233
5185 SDValue Ops[] = {
5186 Chain, // Chain
5187 VData, // vdata
5188 Op.getOperand(3), // vaddr
5189 Op.getOperand(4), // rsrc
5190 Op.getOperand(5), // dmask
5191 Op.getOperand(6), // glc
5192 Op.getOperand(7), // slc
5193 Op.getOperand(8), // lwe
5194 Op.getOperand(9) // da
5195 };
5196 unsigned Opc = (IntrinsicID == Intrinsic::amdgcn_image_store) ?
5197 AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
5198 MemSDNode *M = cast<MemSDNode>(Op);
5199 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5200 M->getMemoryVT(), M->getMemOperand());
5201 }
5202
5203 return SDValue();
5204 }
52345205 default: {
52355206 const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
52365207 AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID);
52375208 if (D16ImageDimIntr) {
52385209 SDValue VData = Op.getOperand(2);
52395210 EVT StoreVT = VData.getValueType();
5240 if ((StoreVT == MVT::v2f16 && !isTypeLegal(StoreVT)) ||
5241 StoreVT == MVT::v4f16) {
5242 VData = handleD16VData(VData, DAG);
5243
5244 SmallVector Ops;
5245 for (auto Value : Op.getNode()->op_values())
5246 Ops.push_back(Value);
5211 if (((StoreVT == MVT::v2f16 || StoreVT == MVT::v4f16) &&
5212 Subtarget->hasUnpackedD16VMem()) ||
5213 !isTypeLegal(StoreVT)) {
5214 SmallVector Ops(Op.getNode()->op_values());
5215
52475216 Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
5248 Ops[2] = VData;
5217 Ops[2] = handleD16VData(VData, DAG);
52495218
52505219 MemSDNode *M = cast<MemSDNode>(Op);
52515220 return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(),
5959 SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
6060 SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
6161
62 SDValue lowerIntrinsicWChain_IllegalReturnType(SDValue Op, SDValue &Chain,
63 SelectionDAG &DAG) const;
62 SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
63 SelectionDAG &DAG,
64 bool IsIntrinsic = false) const;
65
6466 SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
6567
6668 /// Converts \p Op, which must be of floating point type, to the
870870 def : ClampPat;
871871 def : ClampPat;
872872
873 let SubtargetPredicate = HasVOP3PInsts in {
873874 def : GCNPat <
874875 (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
875876 (V_PK_MAX_F16 $src0_modifiers, $src0,
876877 $src0_modifiers, $src0, DSTCLAMP.ENABLE)
877878 >;
879 }
878880
879881 /********** ================================ **********/
880882 /********** Floating point absolute/negative **********/
13321334 (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
13331335 >;
13341336
1337 let SubtargetPredicate = HasVOP3PInsts in {
13351338 def : GCNPat<
13361339 (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
13371340 (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
13381341 >;
1342 }
13391343 }
13401344
13411345 let OtherPredicates = [NoFP32Denormals] in {
13861390 def : ExpPattern;
13871391 def : ExpPattern;
13881392
1393 // COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs
1394 // from S_LSHL_B32's multiple outputs from implicit scc def.
1395 def : GCNPat <
1396 (v2i16 (build_vector (i16 0), i16:$src1)),
1397 (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
1398 >;
1399
1400
1401 let SubtargetPredicate = HasVOP3PInsts in {
13891402 def : GCNPat <
13901403 (v2i16 (build_vector i16:$src0, i16:$src1)),
13911404 (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
13921405 >;
13931406
1394 // COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs
1395 // from S_LSHL_B32's multiple outputs from implicit scc def.
1396 def : GCNPat <
1397 (v2i16 (build_vector (i16 0), i16:$src1)),
1398 (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
1399 >;
1400
14011407 // With multiple uses of the shift, this will duplicate the shift and
14021408 // increase register pressure.
14031409 def : GCNPat <
14041410 (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
14051411 (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
14061412 >;
1413
14071414
14081415 def : GCNPat <
14091416 (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
14161423 (v2f16 (build_vector f16:$src0, f16:$src1)),
14171424 (v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
14181425 >;
1426
1427 } // End SubtargetPredicate = HasVOP3PInsts
1428
14191429
14201430 // def : GCNPat <
14211431 // (v2f16 (scalar_to_vector f16:$src0)),
None ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
22
33 ; FIXME: Need to handle non-uniform case for function below (load without gep).
44 ; GCN-LABEL: {{^}}v_test_add_v2i16:
55 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
66
7 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
7 ; FIXME: or should be unnecessary
88 ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
9 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
10 ; VI: v_or_b32
911 define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
1012 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1113 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
5153 ; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
5254 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
5355
54 ; VI: v_add_u32
55 ; VI: v_add_u32_sdwa
56 ; VI: s_add_i32
57 ; VI: s_add_i32
58 ; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
59 ; VI: s_and_b32
60 ; VI: s_or_b32
5661 define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
5762 %add = add <2 x i16> %a, %b
5863 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
5964 ret void
6065 }
6166
67 ; FIXME: Eliminate or with sdwa
6268 ; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
6369 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
6470 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
6571
72 ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
6673 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
67 ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
68 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
74 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
75 ; VI: v_or_b32_e32
6976 define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
7077 %tid = call i32 @llvm.amdgcn.workitem.id.x()
7178 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
8390
8491 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
8592 ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
86 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
93 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
8794 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
8895 %tid = call i32 @llvm.amdgcn.workitem.id.x()
8996 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
98105 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
99106
100107 ; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
101 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
102 ; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
103 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
104 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
108 ; VI: flat_load_dword [[LOAD:v[0-9]+]]
109 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
110 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD]]
105111 ; VI: v_or_b32_e32
106112 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
107113 %tid = call i32 @llvm.amdgcn.workitem.id.x()
116122 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
117123 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
118124
119 ; VI-NOT: v_add_u16
125 ; VI: flat_load_dword
126 ; VI-NOT: v_add_u16
127 ; VI: v_and_b32_e32 v{{[0-9]+}}, 0xffff0000,
120128 ; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
121129 ; VI-NOT: v_add_u16
122 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
123130 ; VI: v_or_b32_e32
124131 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
125132 %tid = call i32 @llvm.amdgcn.workitem.id.x()
138145
139146 ; VI-NOT: v_add_u16
140147 ; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
141 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
142 ; VI-NOT: v_add_u16
143 ; VI: v_or_b32_e32
148 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
149 ; VI-NOT: v_add_u16
150 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
144151 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
145152 %tid = call i32 @llvm.amdgcn.workitem.id.x()
146153 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
161168 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
162169 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
163170
164 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
165 ; VI: flat_load_ushort v[[A_HI:[0-9]+]]
166 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
167 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
168
169 ; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
171 ; VI: flat_load_dword v[[A:[0-9]+]]
172 ; VI: flat_load_dword v[[B:[0-9]+]]
173
170174 ; VI-NOT: and
171175 ; VI-NOT: shl
172 ; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
176 ; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
177 ; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
173178 ; VI-NOT: and
174179 ; VI-NOT: shl
175180 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
197202 ; GFX9: buffer_store_dwordx4
198203
199204 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
200 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
201 ; VI: flat_load_ushort v[[A_HI:[0-9]+]]
202 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
203 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
205 ; VI: flat_load_dword v[[A:[0-9]+]]
206 ; VI: flat_load_dword v[[B:[0-9]+]]
204207
205208 ; VI-DAG: v_add_u16_e32
206 ; VI-DAG: v_add_u16_e32
209 ; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
207210
208211 ; VI: buffer_store_dwordx4
209212 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
229232 ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
230233 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
231234
235 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
232236 ; VI: v_add_u16_e32
233 ; VI: v_add_u16_e32
237
234238 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
235239 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
236240 ; VI: buffer_store_dwordx2
77 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
88 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
99
10 ; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
10 ; VI: s_load_dword [[LHS:s[0-9]+]]
11 ; VI: s_load_dword [[RHS:s[0-9]+]]
12 ; VI: s_ashr_i32
13 ; VI: s_ashr_i32
14 ; VI: s_sext_i32_i16
15 ; VI: s_sext_i32_i16
16 ; VI: s_ashr_i32
17 ; VI: s_ashr_i32
18 ; VI: s_lshl_b32
19 ; VI: s_and_b32
20 ; VI: s_or_b32
1221
1322 ; CI-DAG: v_ashrrev_i32_e32
1423 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
7070 }
7171
7272 ; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
73 ; SICIVI: buffer_load_ushort
74 ; SICIVI: buffer_load_ushort
75 ; SICIVI: buffer_store_short
76 ; SICIVI: buffer_store_short
73 ; SICI: buffer_load_ushort
74 ; SICI: buffer_load_ushort
75 ; SICI: buffer_store_short
76 ; SICI: buffer_store_short
77
78 ; VI: s_load_dword s
79 ; VI: s_load_dword s
80 ; VI: buffer_store_short
81 ; VI: buffer_store_short
7782
7883 ; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c
7984 ; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30
9196 }
9297
9398 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
94 ; SICIVI: buffer_load_ushort
95 ; SICIVI: buffer_load_ushort
96 ; SICIVI: buffer_load_ushort
99 ; SICI: buffer_load_ushort
100 ; SICI: buffer_load_ushort
101 ; SICI: buffer_load_ushort
102
103 ; SICI: buffer_store_short
104 ; SICI: buffer_store_short
105 ; SICI: buffer_store_short
106
107 ; SICI: buffer_load_ushort
108 ; SICI: buffer_store_short
97109
98110 ; GFX9-DAG: global_load_short_d16_hi v
99111 ; GFX9-DAG: global_load_short_d16 v
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
33
44 ; DAGCombiner will transform:
55 ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
3535 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
3636 ; CI: v_or_b32_e32
3737
38 ; VI: flat_load_ushort [[HI:v[0-9]+]]
39 ; VI: flat_load_ushort [[LO:v[0-9]+]]
40 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
41 ; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[HI]], [[MASK]]
42 ; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
43 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_LO]], [[FABS_HI]]
44 ; VI: flat_store_dword
45
46 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
47 ; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
38 ; GFX89: s_load_dword [[VAL:s[0-9]+]]
39 ; GFX89: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
4840 define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
4941 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
5042 store <2 x half> %fabs, <2 x half> addrspace(1)* %out
5850 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
5951 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
6052
61 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
62 ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
63 ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
64 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
65 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
66 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
67 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
53
54 ; GFX89: s_load_dword s
55 ; GFX89: s_load_dword s
56 ; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
57 ; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
58 ; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
6859
6960 ; GCN: {{flat|global}}_store_dwordx2
7061 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
146137 ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
147138 ; CI-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
148139
149 ; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
140 ; GFX89-DAG: v_mul_f16_e64 v{{[0-9]+}}, |[[VAL]]|, 4.0
150141 ; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
151 ; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
142 ; GFX89-DAG: v_add_f16_sdwa v{{[0-9]+}}, |[[VAL]]|, [[CONST2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
152143 define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
153144 %tid = call i32 @llvm.amdgcn.workitem.id.x()
154145 %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
166157
167158 ; GCN-LABEL: {{^}}v_extract_fabs_no_fold_v2f16:
168159 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
160 ; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 0x7fff7fff, [[VAL]]
169161
170 ; FIXME: Extra bfe on VI
171 ; GFX9-NOT: v_bfe_u32
172 ; VI: v_bfe_u32
173 ; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 0x7fff7fff, [[VAL]]
162
163 ; VI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 15
164 ; VI: flat_store_short
165
174166 ; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[AND]], off
175167 define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
176168 %tid = call i32 @llvm.amdgcn.workitem.id.x()
221221 ret void
222222 }
223223
224 ; FIXME: Fold modifier
225224 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
226 ; VI-DAG: v_bfe_u32
227 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
228 ; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
229 ; VI: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
225 ; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
226 ; VI: v_max_f16_e64 [[REG1:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}|
230227 ; VI-NOT: 0xffff
231228 ; VI: v_or_b32
232229
244241 }
245242
246243 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
247 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
248 ; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
249 ; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
244 ; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
245 ; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}|
250246 ; VI: v_or_b32
251247
252248 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
264260 }
265261
266262 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
267 ; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
268 ; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
269 ; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]]
263 ; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
264 ; VI-DAG: v_max_f16_e64 [[REG0:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}}
270265 ; VI-NOT: 0xffff
271266
272267 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}}
9393 ; SI-NEXT: v_max3_f32
9494 ; SI-NEXT: v_max3_f32
9595
96 ; VI: v_max_f16_e32
97 ; VI-NEXT: v_max_f16_e32
98 ; VI-NEXT: v_max_f16_e32
99 ; VI-NEXT: v_max_f16_e32
100 ; VI-NEXT: v_max_f16_e32
101 ; VI-NEXT: v_max_f16_e32
96 ; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
97 ; VI: v_max_f16_e32 v0, v0, v1
98 ; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
99 ; VI: v_max_f16_e32 v0, v2, v0
100 ; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
101 ; VI: v_max_f16_e32 v0, v0, v3
102 ; VI: v_or_b32_e32 v0, v0, v1
102103
103104 ; GFX9: v_pk_max_f16
104105 ; GFX9-NEXT: v_pk_max_f16
9191 ; SI-NEXT: v_min3_f32
9292 ; SI-NEXT: v_min3_f32
9393
94 ; VI: v_min_f16_e32
95 ; VI-NEXT: v_min_f16_e32
96 ; VI-NEXT: v_min_f16_e32
97 ; VI-NEXT: v_min_f16_e32
98 ; VI-NEXT: v_min_f16_e32
99 ; VI-NEXT: v_min_f16_e32
94 ; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
95 ; VI: v_min_f16_e32 v0, v0, v1
96 ; VI: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
97 ; VI: v_min_f16_e32 v0, v2, v0
98 ; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
99 ; VI: v_min_f16_e32 v0, v0, v3
100 ; VI: v_or_b32_e32 v0, v0, v1
100101
101102 ; GFX9: v_pk_min_f16
102103 ; GFX9: v_pk_min_f16
7272 ; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
7373 ; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
7474 ; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
75 ; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
76 ; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
77 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
78 ; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
79 ; CIVI: flat_store_dword
8075
76 ; FIXME: Random commute
77 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
8178 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
8279 define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
8380 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
9491 ; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
9592 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
9693 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
97 ; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
98 ; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
99 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
100 ; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
101 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
102 ; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
10394
104 ; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
95 ; FIXME: Random commute
96 ; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
97
98 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
99 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
100
105101 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
106102 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
107103
119115 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
120116 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
121117
122 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
118 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|, 4.0
123119 ; VI: v_mul_f16_sdwa v{{[0-9]+}}, -|v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
124120
125121 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
5959 ret void
6060 }
6161
62 ; FIXME: Terrible code with VI and even worse with SI/CI
62 ; FIXME: Terrible code with SI/CI.
63 ; FIXME: scalar for VI, vector for gfx9
6364 ; GCN-LABEL: {{^}}s_fneg_v2f16:
6465 ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
6566 ; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
6768 ; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
6869 ; CI: v_or_b32_e32
6970
70 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}}
71 ; VI-DAG: v_xor_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
72 ; VI-DAG: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
71 ; VI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
7372
7473 ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
75
7674 define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
7775 %fneg = fsub <2 x half> <half -0.000000e+00, half -0.000000e+00>, %in
7876 store <2 x half> %fneg, <2 x half> addrspace(1)* %out
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
22
33 ; half args should be promoted to float for SI and lower.
44
1212 ret void
1313 }
1414
15 ; FIXME: Should always be the same
1516 ; GCN-LABEL: {{^}}load_v2f16_arg:
16 ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
17 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
18 ; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
19 ; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
20 ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
21 ; GCN: s_endpgm
17 ; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
18 ; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
19 ; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
20 ; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
21 ; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
22
23 ; VI: s_load_dword [[ARG:s[0-9]+]]
24 ; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
25 ; VI: buffer_store_dword [[V_ARG]]
2226 define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
2327 store <2 x half> %arg, <2 x half> addrspace(1)* %out
2428 ret void
3943 }
4044
4145 ; GCN-LABEL: {{^}}load_v4f16_arg:
42 ; GCN: buffer_load_ushort
43 ; GCN: buffer_load_ushort
44 ; GCN: buffer_load_ushort
45 ; GCN: buffer_load_ushort
46 ; GCN: buffer_store_dwordx2
47 ; GCN: s_endpgm
46 ; SI: buffer_load_ushort
47 ; SI: buffer_load_ushort
48 ; SI: buffer_load_ushort
49 ; SI: buffer_load_ushort
50 ; SI: buffer_store_dwordx2
51
52 ; FIXME: Why not one load?
53 ; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
54 ; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
55 ; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
56 ; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
57 ; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
4858 define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
4959 store <4 x half> %arg, <4 x half> addrspace(1)* %out
5060 ret void
103113 }
104114
105115 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
106 ; GCN: buffer_load_ushort
107 ; GCN: buffer_load_ushort
108 ; GCN: buffer_load_ushort
109 ; GCN: buffer_load_ushort
110 ; GCN: buffer_load_ushort
111 ; GCN: buffer_load_ushort
112 ; GCN: buffer_load_ushort
113 ; GCN: buffer_load_ushort
116 ; SI: buffer_load_ushort
117 ; SI: buffer_load_ushort
118 ; SI: buffer_load_ushort
119 ; SI: buffer_load_ushort
120 ; SI: buffer_load_ushort
121 ; SI: buffer_load_ushort
122 ; SI: buffer_load_ushort
123 ; SI: buffer_load_ushort
124
125
126 ; VI: s_load_dword s
127 ; VI: s_load_dword s
128 ; VI: s_load_dword s
129 ; VI: s_load_dword s
114130
115131 ; GCN: v_cvt_f32_f16_e32
116132 ; GCN: v_cvt_f32_f16_e32
144160 }
145161
146162 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
147 ; GCN-DAG: buffer_load_ushort v
148 ; GCN-DAG: buffer_load_ushort v
163 ; SI-DAG: buffer_load_ushort v
164 ; SI-DAG: buffer_load_ushort v
165
166 ; VI-DAG: s_load_dword s
167 ; VI: s_lshr_b32
168
149169 ; GCN-DAG: v_cvt_f32_f16_e32
150170 ; GCN-DAG: v_cvt_f32_f16_e32
151171 ; GCN-DAG: v_cvt_f64_f32_e32
175195 }
176196
177197 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
178 ; GCN-DAG: buffer_load_ushort v
179 ; GCN-DAG: buffer_load_ushort v
180 ; GCN-DAG: buffer_load_ushort v
181 ; GCN-DAG: buffer_load_ushort v
198 ; SI: buffer_load_ushort v
199 ; SI: buffer_load_ushort v
200 ; SI: buffer_load_ushort v
201 ; SI: buffer_load_ushort v
202
203 ; VI: s_load_dword s
204 ; VI: s_load_dword s
205
182206 ; GCN-DAG: v_cvt_f32_f16_e32
183207 ; GCN-DAG: v_cvt_f32_f16_e32
184208 ; GCN-DAG: v_cvt_f32_f16_e32
195219 }
196220
197221 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
198 ; GCN-DAG: buffer_load_ushort v
199 ; GCN-DAG: buffer_load_ushort v
200 ; GCN-DAG: buffer_load_ushort v
201 ; GCN-DAG: buffer_load_ushort v
202
203 ; GCN-DAG: buffer_load_ushort v
204 ; GCN-DAG: buffer_load_ushort v
205 ; GCN-DAG: buffer_load_ushort v
206 ; GCN-DAG: buffer_load_ushort v
222 ; SI: buffer_load_ushort v
223 ; SI: buffer_load_ushort v
224 ; SI: buffer_load_ushort v
225 ; SI: buffer_load_ushort v
226
227 ; SI: buffer_load_ushort v
228 ; SI: buffer_load_ushort v
229 ; SI: buffer_load_ushort v
230 ; SI: buffer_load_ushort v
231
232
233 ; VI: s_load_dword s
234 ; VI: s_load_dword s
235 ; VI: s_load_dword s
236 ; VI: s_load_dword s
237
238
207239
208240 ; GCN-DAG: v_cvt_f32_f16_e32
209241 ; GCN-DAG: v_cvt_f32_f16_e32
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
33 ; FIXME: Merge into imm.ll
44
55 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
119119 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
120120 ; GFX9: buffer_store_dword [[REG]]
121121
122 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
123 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
124 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]]
122 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
123 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
125124 ; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
126 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
125 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
126 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
127
128 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
129 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0
127130 ; VI: v_or_b32
128131 ; VI: buffer_store_dword
129132 define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
137140 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0]{{$}}
138141 ; GFX9: buffer_store_dword [[REG]]
139142
140 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
141 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
142 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]]
143 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
144 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
143145 ; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
144 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
146 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
147 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
148
149 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
150 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5
145151 ; VI: v_or_b32
146152 ; VI: buffer_store_dword
147153 define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
155161 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0]{{$}}
156162 ; GFX9: buffer_store_dword [[REG]]
157163
158 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
159 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
160 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]]
164 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
165 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
161166 ; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
162 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
167 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
168 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
169
170 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
171 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5
163172 ; VI: v_or_b32
164173 ; VI: buffer_store_dword
165174 define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
173182 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0]{{$}}
174183 ; GFX9: buffer_store_dword [[REG]]
175184
176 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
177 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
178 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]]
185 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
186 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
179187 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
180 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
188 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
189 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
190
191 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
192 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0
181193 ; VI: v_or_b32
182194 ; VI: buffer_store_dword
183195 define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
191203 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0]{{$}}
192204 ; GFX9: buffer_store_dword [[REG]]
193205
194 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
195 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
196 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]]
197 ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
198 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
206
207 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
208 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
209 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0xbc00
210 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
211 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
212
213 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
214 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0
199215 ; VI: v_or_b32
200216 ; VI: buffer_store_dword
201217 define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
209225 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0]{{$}}
210226 ; GFX9: buffer_store_dword [[REG]]
211227
212 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
213 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
214 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]]
228 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
229 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
215230 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
216 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
231 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
232 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
233
234 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
235 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0
217236 ; VI: v_or_b32
218237 ; VI: buffer_store_dword
219238 define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
227246 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0]{{$}}
228247 ; GFX9: buffer_store_dword [[REG]]
229248
230 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
231 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
232 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]]
249 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
250 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
233251 ; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
234 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
252 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
253 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
254
255 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
256 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0
235257 ; VI: v_or_b32
236258 ; VI: buffer_store_dword
237259 define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
245267 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0]{{$}}
246268 ; GFX9: buffer_store_dword [[REG]]
247269
248 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
249 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
250 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]]
270 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
271 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
251272 ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
252 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
273 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
274 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
275
276 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
277 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0
253278 ; VI: v_or_b32
254279 ; VI: buffer_store_dword
255280 define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
263288 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0]{{$}}
264289 ; GFX9: buffer_store_dword [[REG]]
265290
266 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
267 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
268 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]]
291 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
292 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
269293 ; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
270 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
294 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
295 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
296
297 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
298 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0
271299 ; VI: v_or_b32
272300 ; VI: buffer_store_dword
273301 define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
320348 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0]{{$}}
321349 ; GFX9: buffer_store_dword [[REG]]
322350
323 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
324 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
325 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]]
326 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1
327 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
351 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
352 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
353 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1{{$}}
354 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
355 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
356
357 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
358 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1{{$}}
328359 ; VI: v_or_b32
329360 ; VI: buffer_store_dword
330361 define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
338369 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0]{{$}}
339370 ; GFX9: buffer_store_dword [[REG]]
340371
341 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
342 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
343 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]]
344 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2
345 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
372
373 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
374 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
375 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2{{$}}
376 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
377 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
378
379 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
380 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2{{$}}
346381 ; VI: v_or_b32
347382 ; VI: buffer_store_dword
348383 define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
356391 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0]{{$}}
357392 ; GFX9: buffer_store_dword [[REG]]
358393
359 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
360 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
361 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]]
362 ; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16
363 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
394
395 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
396 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
397 ; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16{{$}}
398 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
399 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
400
401 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
402 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16{{$}}
364403 ; VI: v_or_b32
365404 ; VI: buffer_store_dword
366405 define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
374413 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
375414 ; GFX9: buffer_store_dword [[REG]]
376415
377 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
378 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
379 ; VI: v_or_b32_e32 [[REG:v[0-9]+]]
380 ; VI: v_add_u32_e32 [[REG]], vcc, -1, [[REG]]
416 ; VI: s_load_dword [[VAL:s[0-9]+]]
417 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}}
418 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
381419 ; VI: buffer_store_dword [[REG]]
382420 define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
383421 %xbc = bitcast <2 x half> %x to i32
392430 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
393431 ; GFX9: buffer_store_dword [[REG]]
394432
395 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
396 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
397 ; VI: v_or_b32_e32 [[REG:v[0-9]+]]
398 ; VI: v_add_u32_e32 [[REG]], vcc, 0xfffefffe, [[REG]]
433 ; VI: s_load_dword [[VAL:s[0-9]+]]
434 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}}
435 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
399436 ; VI: buffer_store_dword [[REG]]
400437 define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
401438 %xbc = bitcast <2 x half> %x to i32
410447 ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]]
411448 ; GFX9: buffer_store_dword [[REG]]
412449
413 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
414 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
415 ; VI: v_or_b32_e32 [[REG:v[0-9]+]]
416 ; VI: v_add_u32_e32 [[REG]], vcc, 0xfff0fff0, [[REG]]
450
451 ; VI: s_load_dword [[VAL:s[0-9]+]]
452 ; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}}
453 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
417454 ; VI: buffer_store_dword [[REG]]
418455 define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
419456 %xbc = bitcast <2 x half> %x to i32
428465 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
429466 ; GFX9: buffer_store_dword [[REG]]
430467
431 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
432 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
433 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]]
468 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
469 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
434470 ; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
435 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
471 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
472 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
473
474 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
475 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63
436476 ; VI: v_or_b32
437477 ; VI: buffer_store_dword
438478 define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
446486 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
447487 ; GFX9: buffer_store_dword [[REG]]
448488
449 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
450 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
451 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]]
489 ; FIXME: Shouldn't need right shift and SDWA, also extra copy
490 ; VI-DAG: s_load_dword [[VAL:s[0-9]+]]
452491 ; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
453 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
492 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16
493 ; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]]
494
495 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
496 ; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64
454497 ; VI: v_or_b32
455498 ; VI: buffer_store_dword
456499 define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
4141 ret void
4242 }
4343
44 ; GCN: error: couldn't allocate output register for constraint 's'
45 ; GCN: error: couldn't allocate input reg for constraint 's'
44 ; CI: error: couldn't allocate output register for constraint 's'
45 ; CI: error: couldn't allocate input reg for constraint 's'
46
47 ; VI-NOT: error
4648 define amdgpu_kernel void @s_input_output_v2f16() {
4749 %v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
4850 tail call void asm sideeffect "; use $0", "s"(<2 x half> %v)
4951 ret void
5052 }
5153
52 ; GCN: error: couldn't allocate output register for constraint 'v'
53 ; GCN: error: couldn't allocate input reg for constraint 'v'
54 ; CI: error: couldn't allocate output register for constraint 'v'
55 ; CI: error: couldn't allocate input reg for constraint 'v'
56 ; VI-NOT: error
5457 define amdgpu_kernel void @v_input_output_v2f16() {
5558 %v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"()
5659 tail call void asm sideeffect "; use $0", "v"(<2 x half> %v)
6669 ret void
6770 }
6871
69 ; GCN: error: couldn't allocate output register for constraint 's'
70 ; GCN: error: couldn't allocate input reg for constraint 's'
72 ; FIXME: Should work on all targets?
73
74 ; CI: error: couldn't allocate output register for constraint 's'
75 ; CI: error: couldn't allocate input reg for constraint 's'
76
77 ; VI-NOT: error
7178 define amdgpu_kernel void @s_input_output_v2i16() {
7279 %v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"()
7380 tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v)
0 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-NO-TONGA %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-TONGA %s
0 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
22
33 ; FIXME: Broken on evergreen
44 ; FIXME: For some reason the 8 and 16 vectors are being stored as
0 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
0 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
2 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
33
44 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
55 ; GCN: s_load_dword [[VEC:s[0-9]+]]
3838 ; GCN: s_load_dword [[ELT0:s[0-9]+]]
3939 ; GCN: s_load_dword [[VEC:s[0-9]+]]
4040
41 ; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
42 ; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
43 ; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
44 ; CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
45 ; CIVI-DAG: ; use [[SHR]]
41 ; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
42 ; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
43 ; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
44 ; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
45 ; CI-DAG: ; use [[SHR]]
46
47
48 ; FIXME: Should be able to avoid mask of upper bits
49 ; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
50 ; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
51 ; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]]
52 ; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
53
54 ; VI-DAG: ; use [[SHR]]
55
4656
4757 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
4858 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
102112 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
103113 ; GCN: s_load_dword [[VEC:s[0-9]+]],
104114
105 ; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
106 ; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
107 ; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16
108 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
115 ; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
116 ; CI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
117 ; CI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16
118 ; CI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
119
120
121 ; VI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
122 ; VI-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16
123 ; VI: s_and_b32 [[MASK_HI:s[0-9]+]], [[VEC]], 0xffff0000
124 ; VI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[MASK_HI]]
109125
110126 ; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
111127 ; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16
0 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC
1 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC
2 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC
3 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC
4 ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC
0 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
1 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
2 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
3 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
4 ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s
55
66 ; FUNC-LABEL: {{^}}i8_arg:
77 ; HSA-VI: kernarg_segment_alignment = 4
161161 ; HSA-VI: kernarg_segment_alignment = 4
162162 ; EG: VTX_READ_16
163163 ; EG: VTX_READ_16
164 ; MESA-GCN: buffer_load_ushort
165 ; MESA-GCN: buffer_load_ushort
166 ; HSA-VI: flat_load_ushort
167 ; HSA-VI: flat_load_ushort
164
165 ; SI: buffer_load_ushort
166 ; SI: buffer_load_ushort
167
168 ; VI: s_load_dword s
168169 define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
169170 entry:
170171 store <2 x i16> %in, <2 x i16> addrspace(1)* %out
284285 ; EG: VTX_READ_16
285286 ; EG: VTX_READ_16
286287 ; EG: VTX_READ_16
287 ; MESA-GCN: buffer_load_ushort
288 ; MESA-GCN: buffer_load_ushort
289 ; MESA-GCN: buffer_load_ushort
290 ; MESA-GCN: buffer_load_ushort
291 ; HSA-GCN: flat_load_ushort
292 ; HSA-GCN: flat_load_ushort
293 ; HSA-GCN: flat_load_ushort
294 ; HSA-GCN: flat_load_ushort
288
289 ; SI: buffer_load_ushort
290 ; SI: buffer_load_ushort
291 ; SI: buffer_load_ushort
292 ; SI: buffer_load_ushort
293
294 ; VI: s_load_dword s
295 ; VI: s_load_dword s
295296 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
296297 entry:
297298 store <4 x i16> %in, <4 x i16> addrspace(1)* %out
304305 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
305306 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
306307 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
308
307309 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
308310 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
309311 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
369371 ; EG: VTX_READ_16
370372 ; EG: VTX_READ_16
371373 ; EG: VTX_READ_16
372 ; MESA-GCN: buffer_load_ushort
373 ; MESA-GCN: buffer_load_ushort
374 ; MESA-GCN: buffer_load_ushort
375 ; MESA-GCN: buffer_load_ushort
376 ; MESA-GCN: buffer_load_ushort
377 ; MESA-GCN: buffer_load_ushort
378 ; MESA-GCN: buffer_load_ushort
379 ; MESA-GCN: buffer_load_ushort
380 ; HSA-VI: flat_load_ushort
381 ; HSA-VI: flat_load_ushort
382 ; HSA-VI: flat_load_ushort
383 ; HSA-VI: flat_load_ushort
384 ; HSA-VI: flat_load_ushort
385 ; HSA-VI: flat_load_ushort
386 ; HSA-VI: flat_load_ushort
387 ; HSA-VI: flat_load_ushort
374
375 ; SI: buffer_load_ushort
376 ; SI: buffer_load_ushort
377 ; SI: buffer_load_ushort
378 ; SI: buffer_load_ushort
379 ; SI: buffer_load_ushort
380 ; SI: buffer_load_ushort
381 ; SI: buffer_load_ushort
382 ; SI: buffer_load_ushort
383
384 ; VI: s_load_dword s
385 ; VI: s_load_dword s
386 ; VI: s_load_dword s
387 ; VI: s_load_dword s
388388 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
389389 entry:
390390 store <8 x i16> %in, <8 x i16> addrspace(1)* %out
501501 ; EG: VTX_READ_16
502502 ; EG: VTX_READ_16
503503 ; EG: VTX_READ_16
504 ; MESA-GCN: buffer_load_ushort
505 ; MESA-GCN: buffer_load_ushort
506 ; MESA-GCN: buffer_load_ushort
507 ; MESA-GCN: buffer_load_ushort
508 ; MESA-GCN: buffer_load_ushort
509 ; MESA-GCN: buffer_load_ushort
510 ; MESA-GCN: buffer_load_ushort
511 ; MESA-GCN: buffer_load_ushort
512 ; MESA-GCN: buffer_load_ushort
513 ; MESA-GCN: buffer_load_ushort
514 ; MESA-GCN: buffer_load_ushort
515 ; MESA-GCN: buffer_load_ushort
516 ; MESA-GCN: buffer_load_ushort
517 ; MESA-GCN: buffer_load_ushort
518 ; MESA-GCN: buffer_load_ushort
519 ; MESA-GCN: buffer_load_ushort
520 ; HSA-VI: flat_load_ushort
521 ; HSA-VI: flat_load_ushort
522 ; HSA-VI: flat_load_ushort
523 ; HSA-VI: flat_load_ushort
524 ; HSA-VI: flat_load_ushort
525 ; HSA-VI: flat_load_ushort
526 ; HSA-VI: flat_load_ushort
527 ; HSA-VI: flat_load_ushort
528 ; HSA-VI: flat_load_ushort
529 ; HSA-VI: flat_load_ushort
530 ; HSA-VI: flat_load_ushort
531 ; HSA-VI: flat_load_ushort
532 ; HSA-VI: flat_load_ushort
533 ; HSA-VI: flat_load_ushort
534 ; HSA-VI: flat_load_ushort
535 ; HSA-VI: flat_load_ushort
504
505 ; SI: buffer_load_ushort
506 ; SI: buffer_load_ushort
507 ; SI: buffer_load_ushort
508 ; SI: buffer_load_ushort
509 ; SI: buffer_load_ushort
510 ; SI: buffer_load_ushort
511 ; SI: buffer_load_ushort
512 ; SI: buffer_load_ushort
513 ; SI: buffer_load_ushort
514 ; SI: buffer_load_ushort
515 ; SI: buffer_load_ushort
516 ; SI: buffer_load_ushort
517 ; SI: buffer_load_ushort
518 ; SI: buffer_load_ushort
519 ; SI: buffer_load_ushort
520 ; SI: buffer_load_ushort
521
522 ; VI: s_load_dword s
523 ; VI: s_load_dword s
524 ; VI: s_load_dword s
525 ; VI: s_load_dword s
526 ; VI: s_load_dword s
527 ; VI: s_load_dword s
528 ; VI: s_load_dword s
529 ; VI: s_load_dword s
536530 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
537531 entry:
538532 store <16 x i16> %in, <16 x i16> addrspace(1)* %out
0 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
0 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
11 ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
22 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
33
1212
1313 ; GCN-LABEL: {{^}}buffer_store_format_d16_xy:
1414
15 ; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
16 ; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
17 ; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
15 ; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
16 ; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16
17 ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
18 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
19 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
20 ; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
1821
1922 ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
2023 define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) {
2528
2629 ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
2730
28 ; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
29 ; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ;
30 ; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ;
31 ; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
31 ; UNPACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
32 ; UNPACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38
33
34 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
35 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16
36 ; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]]
37 ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16
38 ; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]]
39
40 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
41 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
42
3243 ; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
3344
34 ; GFX81: v_or_b32_e32 v[[HI:[0-9]+]]
35 ; GFX81: v_or_b32_e32 v[[LO:[0-9]+]]
3645
37 ; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]]
38 ; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]]
46
47 ; PACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
48 ; PACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38
49
50 ; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]]
51 ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]]
3952
4053 ; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
4154 define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) {
0 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s
1 ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
2 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
0 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
1 ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
2 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
33
44 ; GCN-LABEL: {{^}}image_load_f16
55 ; GCN: image_load v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16
5757 ret void
5858 }
5959
60 ; GCN-LABEL: {{^}}image_store_v2f16
60 ; FIXME: Eliminate and to get low bits
61 ; GCN-LABEL: {{^}}image_store_v2f16:
62 ; UNPACKED: s_load_dword [[DATA:s[0-9]+]]
63 ; UNPACKED-DAG: s_lshr_b32 [[UNPACK_1:s[0-9]+]], [[DATA]], 16
64 ; UNPACKED-DAG: s_and_b32 [[UNPACK_0:s[0-9]+]], [[DATA]], 0xffff
65 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_0:[0-9]+]], [[UNPACK_0]]
66 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_1:[0-9]+]], [[UNPACK_1]]
6167
62 ; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
63 ; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
64 ; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16
68
69
70 ; UNPACKED: image_store v{{\[}}[[V_UNPACK_0]]:[[V_UNPACK_1]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16
6571
6672 ; PACKED: image_store v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16
6773 define amdgpu_kernel void @image_store_v2f16(<2 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
7177 }
7278
7379 ; GCN-LABEL: {{^}}image_store_v4f16
80 ; UNPACKED: s_load_dword s
81 ; UNPACKED: s_load_dword s
82 ; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
83 ; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
84 ; UNPACKED: s_and_b32
85 ; UNPACKED: s_and_b32
86 ; UNPACKED: image_store v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
7487
75 ; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
76 ; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}}
77 ; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}}
78 ; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
79 ; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
80
81 ; GFX81: v_or_b32_e32 v[[HI:[0-9]+]]
82 ; GFX81: v_or_b32_e32 v[[LO:[0-9]+]]
83
84 ; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]]
85 ; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]]
86
87 ; PACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
88 ; PACKED: s_load_dword [[DATA0:s[0-9]+]]
89 ; PACKED: s_load_dword [[DATA1:s[0-9]+]]
90 ; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]]
91 ; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]]
92 ; PACKED: image_store v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
8893 define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
8994 main_body:
9095 call void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
9297 }
9398
9499 ; GCN-LABEL: {{^}}image_store_mip_v4f16
100 ; UNPACKED: s_load_dword s
101 ; UNPACKED: s_load_dword s
102 ; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
103 ; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
104 ; UNPACKED: s_and_b32
105 ; UNPACKED: s_and_b32
106 ; UNPACKED: image_store_mip v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
95107
96 ; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
97 ; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}}
98 ; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}}
99 ; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}}
100 ; UNPACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
101
102 ; GFX81: v_or_b32_e32 v[[HI:[0-9]+]]
103 ; GFX81: v_or_b32_e32 v[[LO:[0-9]+]]
104
105 ; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]]
106 ; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]]
107
108 ; PACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
108 ; PACKED: s_load_dword [[DATA0:s[0-9]+]]
109 ; PACKED: s_load_dword [[DATA1:s[0-9]+]]
110 ; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]]
111 ; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]]
112 ; PACKED: image_store_mip v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
109113 define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
110114 main_body:
111115 call void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
0 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
12 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
23
34 ; GCN-LABEL: {{^}}load_1d:
0 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
0 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
11 ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s
22 ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s
33
1111 ret void
1212 }
1313
14
1514 ; GCN-LABEL: {{^}}tbuffer_store_d16_xy:
16
17 ; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
18 ; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
19 ; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
15 ; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
16 ; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16
17 ; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}}
18 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
19 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
20 ; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
2021
2122 ; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
2223 define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) {
2526 ret void
2627 }
2728
29 ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
30 ; GCN-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
31 ; GCN-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38
2832
29 ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
33 ; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
34 ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16
35 ; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]]
36 ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16
37 ; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]]
3038
31 ; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
32 ; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ;
33 ; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ;
34 ; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ;
39 ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
40 ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
3541 ; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
3642
37 ; GFX81: v_or_b32_e32 v[[HI:[0-9]+]]
38 ; GFX81: v_or_b32_e32 v[[LO:[0-9]+]]
3943
40 ; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]]
41 ; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]]
42
44 ; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]]
45 ; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]]
4346 ; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
4447 define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
4548 main_body:
144144 }
145145
146146 ; GCN-LABEL: {{^}}fma_v2f16_imm_a:
147 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
148 ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
147 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
148 ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
149
150 ; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
151 ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
152
149153
150154 ; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
151155 ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
184188 ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
185189 ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
186190
191 ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
187192 ; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
188 ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
189193
190194 ; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
191195 ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
227231 ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
228232 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
229233
234 ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
230235 ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
231 ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
232236
233237 ; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
234238 ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
33
44 ; GCN-LABEL: {{^}}s_lshr_v2i16:
55 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
77 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
88 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
99
10 ; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10
11 ; VI: s_load_dword [[LHS:s[0-9]+]]
12 ; VI: s_load_dword [[RHS:s[0-9]+]]
13 ; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
14 ; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
15 ; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
16 ; VI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
17 ; VI-DAG: s_lshl_b32
18 ; VI: v_or_b32_e32
19
1120 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
12 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
13 ; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
14 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
21 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
22 ; CI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
23 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
1524 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
1625 %result = lshr <2 x i16> %lhs, %rhs
1726 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
116116 ; SI: v_min_i32
117117 ; SI: v_min_i32
118118
119 ; VI: v_min_i32
120 ; VI: v_min_i32
119 ; VI: s_sext_i32_i16
120 ; VI: s_sext_i32_i16
121 ; VI: s_min_i32
122 ; VI: s_min_i32
121123
122124 ; GFX9: v_pk_min_i16
123125
130132 ret void
131133 }
132134
133 ; FIXME: VI use s_min_i32
134135 ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
135136 ; SI: v_min_i32
136137 ; SI: v_min_i32
137138 ; SI: v_min_i32
138139 ; SI: v_min_i32
139140
140 ; VI: v_min_i32
141 ; VI: v_min_i32
142 ; VI: v_min_i32
143 ; VI: v_min_i32
141 ; VI: s_min_i32
142 ; VI: s_min_i32
143 ; VI: s_min_i32
144 ; VI: s_min_i32
144145
145146 ; GFX9: v_pk_min_i16
146147 ; GFX9: v_pk_min_i16
460461 ; SI: v_min_u32
461462 ; SI: v_min_u32
462463
463 ; VI: v_min_u32
464 ; VI: v_min_u32
465 ; VI: v_min_u32
466 ; VI: v_min_u32
467 ; VI: v_min_u32
468 ; VI: v_min_u32
469 ; VI: v_min_u32
470 ; VI: v_min_u32
464 ; VI: s_min_u32
465 ; VI: s_min_u32
466 ; VI: s_min_u32
467 ; VI: s_min_u32
468 ; VI: s_min_u32
469 ; VI: s_min_u32
470 ; VI: s_min_u32
471 ; VI: s_min_u32
471472
472473 ; EG: MIN_UINT
473474 ; EG: MIN_UINT
44 ; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
55 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
66
7 ; VI: v_add_f16_e32
7 ; VI: v_add_f16_sdwa
88 ; VI-NEXT: v_add_f16_e32
99 ; VI-NEXT: v_add_f16_e32
1010 define half @reduction_half4(<4 x half> %vec4) {
2121 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
2222 ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2323
24 ; VI: v_add_u16_e32
24 ; VI: v_add_u16_sdwa
2525 ; VI-NEXT: v_add_u16_e32
2626 ; VI-NEXT: v_add_u16_e32
2727 define i16 @reduction_v4i16(<4 x i16> %vec4) {
4040 ; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
4141 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4242
43 ; VI: v_add_f16_e32
44 ; VI-NEXT: v_add_f16_e32
43 ; VI: v_add_f16_sdwa
44 ; VI-NEXT: v_add_f16_sdwa
4545 ; VI-NEXT: v_add_f16_e32
4646 ; VI-NEXT: v_add_f16_e32
4747 ; VI-NEXT: v_add_f16_e32
6666 ; GFX9-NEXT: v_pk_add_u16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}}
6767 ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
6868
69 ; VI: v_add_u16_e32
70 ; VI-NEXT: v_add_u16_e32
69 ; VI: v_add_u16_sdwa
70 ; VI-NEXT: v_add_u16_sdwa
7171 ; VI-NEXT: v_add_u16_e32
7272 ; VI-NEXT: v_add_u16_e32
7373 ; VI-NEXT: v_add_u16_e32
9696 ; GFX9-NEXT: v_pk_add_f16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}}
9797 ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9898
99 ; VI: v_add_f16_e32
100 ; VI-NEXT: v_add_f16_e32
101 ; VI-NEXT: v_add_f16_e32
102 ; VI-NEXT: v_add_f16_e32
99 ; VI: v_add_f16_sdwa
100 ; VI-NEXT: v_add_f16_sdwa
101 ; VI-NEXT: v_add_f16_sdwa
102 ; VI-NEXT: v_add_f16_sdwa
103103 ; VI-NEXT: v_add_f16_e32
104104 ; VI-NEXT: v_add_f16_e32
105105 ; VI-NEXT: v_add_f16_e32
130130 ; GFX9: v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
131131 ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
132132
133 ; VI: v_min_u16_e32
133 ; VI: v_min_u16_sdwa
134134 ; VI-NEXT: v_min_u16_e32
135135 ; VI-NEXT: v_min_u16_e32
136136 define i16 @reduction_min_v4i16(<4 x i16> %vec4) {
151151 ; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}}
152152 ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
153153
154 ; VI: v_min_u16_e32
155 ; VI-NEXT: v_min_u16_e32
154 ; VI: v_min_u16_sdwa
155 ; VI-NEXT: v_min_u16_sdwa
156156 ; VI-NEXT: v_min_u16_e32
157157 ; VI-NEXT: v_min_u16_e32
158158 ; VI-NEXT: v_min_u16_e32
223223 ; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
224224 ; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
225225
226 ; VI: v_min_i16_e32
227 ; VI-NEXT: v_min_i16_e32
228 ; VI-NEXT: v_min_i16_e32
229 ; VI-NEXT: v_min_i16_e32
226 ; VI: v_min_i16_sdwa
227 ; VI-NEXT: v_min_i16_sdwa
228 ; VI-NEXT: v_min_i16_sdwa
229 ; VI-NEXT: v_min_i16_sdwa
230230 ; VI-NEXT: v_min_i16_e32
231231 ; VI-NEXT: v_min_i16_e32
232232 ; VI-NEXT: v_min_i16_e32
338338 ; GFX9: v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
339339 ; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
340340
341 ; VI: v_max_u16_e32
341 ; VI: v_max_u16_sdwa
342342 ; VI-NEXT: v_max_u16_e32
343343 ; VI-NEXT: v_max_u16_e32
344344 define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
357357 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
358358 ; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
359359
360 ; VI: v_max_i16_e32
360 ; VI: v_max_i16_sdwa
361361 ; VI-NEXT: v_max_i16_e32
362362 ; VI-NEXT: v_max_i16_e32
363363 define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 {
376376 ; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
377377 ; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
378378
379 ; VI: v_max_f16_e32
379 ; VI: v_max_f16_sdwa
380380 ; VI-NEXT: v_max_f16_e32
381381 ; VI-NEXT: v_max_f16_e32
382382 define half @reduction_fmax_v4half(<4 x half> %vec4) {
395395 ; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
396396 ; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
397397
398 ; VI: v_min_f16_e32
398 ; VI: v_min_f16_sdwa
399399 ; VI-NEXT: v_min_f16_e32
400400 ; VI-NEXT: v_min_f16_e32
401401 define half @reduction_fmin_v4half(<4 x half> %vec4) {
408408 %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
409409 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
410410 ret half %res
411 }
411 }
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SDWA,GCN %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDWA,GCN %s
33
44 ; GCN-LABEL: {{^}}add_shr_i32:
55 ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
33
44 ; Test expansion of scalar selects on vectors.
55 ; Evergreen not enabled since it seems to be having problems with doubles.
7575 }
7676
7777 ; GCN-LABEL: {{^}}select_v2i16:
78 ; GCN: v_cndmask_b32_e32
79 ; GCN-NOT: v_cndmask_b32
78 ; GFX89: s_load_dword
79 ; GFX89: s_load_dword
80 ; GFX89: s_load_dword
81 ; GFX89: v_cndmask_b32
82 ; GFX89-NOT: v_cndmask_b32
83
84 ; SI: v_cndmask_b32_e32
85 ; SI-NOT: v_cndmask_b32
8086 define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
8187 %cmp = icmp eq i32 %c, 0
8288 %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
8591 }
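The GFX89 checks above reflect that, with v2i16 legal, a <2 x i16> select stays in a single 32-bit register on VI as well as gfx9: each operand arrives via one s_load_dword and a single v_cndmask_b32 picks both halves at once. Written out by hand over the packed dword, the select is roughly equivalent to the scalar form below (an illustration, not taken from the test):

define amdgpu_kernel void @select_v2i16_as_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
  %cmp = icmp eq i32 %c, 0
  ; Both 16-bit halves live in one dword, so one 32-bit select
  ; (e.g. a single v_cndmask_b32) handles the whole vector.
  %sel = select i1 %cmp, i32 %a, i32 %b
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}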
8692
8793 ; GCN-LABEL: {{^}}v_select_v2i16:
88 ; GCN: v_cndmask_b32_e32
94 ; GCN: buffer_load_dword v
95 ; GCN: buffer_load_dword v
96 ; GCN: v_cndmask_b32
8997 ; GCN-NOT: cndmask
9098 define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
9199 %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
329337 }
330338
331339 ; GCN-LABEL: {{^}}v_select_v2f16:
332 ; GCN: v_cndmask_b32_e32
340 ; GCN: v_cndmask_b32
333341 ; GCN-NOT: cndmask
334342 define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
335343 %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
None ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
33
44 ; GCN-LABEL: {{^}}s_shl_v2i16:
55 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
77 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
88 ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
99
10 ; VI: v_lshlrev_b32_e32
11 ; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
10 ; VI: s_load_dword s
11 ; VI: s_load_dword s
12 ; VI: s_lshr_b32
13 ; VI: s_lshr_b32
14 ; VI: s_and_b32
15 ; VI: s_and_b32
16 ; VI: s_and_b32
17 ; VI: s_or_b32
1318
1419 ; CI-DAG: v_lshlrev_b32_e32
1520 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
22
33 ; GCN-LABEL: {{^}}s_sext_i1_to_i32:
44 ; GCN: v_cndmask_b32_e64
176176 ret void
177177 }
178178
179 ; FIXME: s_bfe_i64
179 ; FIXME: s_bfe_i64, same on SI and VI
180180 ; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32:
181 ; GCN-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
182 ; GCN-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
181 ; SI-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
182 ; SI-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
183
184 ; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
185 ; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
186
187
183188 ; GCN-DAG: s_sext_i32_i16
184189 ; GCN-DAG: s_sext_i32_i16
185190 ; GCN: s_endpgm
198203 }
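The SI and VI checks above reduce to the two standard scalar idioms for sign-extending the halves of a packed dword: an arithmetic shift right by 16 for the high half, and a sign-extension of the low 16 bits, which the GCN checks match as s_sext_i32_i16. A hand-written illustration of both idioms in IR terms (not taken from the test):

define i32 @sext_hi16(i32 %x) {
  ; High half: arithmetic shift right by 16 (s_ashr_i32 s, s, 16).
  %hi = ashr i32 %x, 16
  ret i32 %hi
}

define i32 @sext_lo16(i32 %x) {
  ; Low half: shift the half to the top, then shift back down arithmetically;
  ; the backend can fold this into s_sext_i32_i16 (or a bfe).
  %shl = shl i32 %x, 16
  %lo = ashr i32 %shl, 16
  ret i32 %lo
}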
199204
200205 ; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32:
201 ; SI-DAG: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 48
202 ; VI-DAG: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 48, v{{\[[0-9]+:[0-9]+\]}}
203206 ; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
204207 ; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
205208 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,GCN %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI,CIVI,GCN %s
33
44 ; GCN-LABEL: {{^}}s_abs_v2i16:
55 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
77 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
88 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
99
10 ; VI: v_sub_u32_e32
11 ; VI-DAG: v_sub_u32_e32
12 ; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
13 ; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
14 ; VI: v_add_u32_e32
15 ; VI: v_add_u32_e32
16 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
10 ; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
11 ; VI: s_sub_i32
12 ; VI: s_sub_i32
13 ; VI: s_max_i32
14 ; VI: s_max_i32
15 ; VI: s_add_i32
16 ; VI: s_add_i32
17 ; VI: s_and_b32
18 ; VI: s_or_b32
1719
1820 ; CI: v_sub_i32_e32
1921 ; CI-DAG: v_sub_i32_e32
None ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,GCN %s
22
33 ; FIXME: Need to handle non-uniform case for function below (load without gep).
44 ; GCN-LABEL: {{^}}v_test_sub_v2i16:
5 ; GFX89: {{flat|global}}_load_dword
6 ; GFX89: {{flat|global}}_load_dword
7
58 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
69
7 ; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8 ; VI: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
10 ; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
11 ; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
912 define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
1013 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1114 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
4649
4750 ; FIXME: VI should not scalarize arg access.
4851 ; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
52 ; GCN: s_load_dword s
53 ; GCN: s_load_dword s
54
4955 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
5056
51 ; VI: v_subrev_u32_e32
52 ; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
57 ; VI: s_sub_i32
58 ; VI: s_sub_i32
59 ; VI: s_lshl_b32
60 ; VI: s_and_b32
5361 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
5462 %add = sub <2 x i16> %a, %b
5563 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
5765 }
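In the kernarg case above, both <2 x i16> arguments are read with s_load_dword and the subtraction happens per half on the scalar ALU before the result is repacked, which is what the s_sub_i32 / s_lshl_b32 / s_and_b32 checks are matching. The computation amounts to the following straight-line IR over a packed dword (a hand-written sketch, not part of the test):

define i32 @sub_packed_v2i16(i32 %a, i32 %b) {
  ; Unpack the two 16-bit lanes of each dword.
  %a.lo = and i32 %a, 65535
  %b.lo = and i32 %b, 65535
  %a.hi = lshr i32 %a, 16
  %b.hi = lshr i32 %b, 16
  ; Subtract each lane independently (s_sub_i32).
  %lo = sub i32 %a.lo, %b.lo
  %hi = sub i32 %a.hi, %b.hi
  ; Repack: mask off any borrow out of the low lane, shift the high lane up.
  %lo.m = and i32 %lo, 65535
  %hi.s = shl i32 %hi, 16
  %r = or i32 %lo.m, %hi.s
  ret i32 %r
}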
5866
5967 ; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
60 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
68 ; GFX89-DAG: {{flat|global}}_load_dword
69
70 ; GFX9-DAG: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
6171 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
6272
6373 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
64 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6574 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
75 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
76 ; VI: v_or_b32
6677 define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
6778 %tid = call i32 @llvm.amdgcn.workitem.id.x()
6879 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
94105 ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
95106 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
96107
97 ; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
98 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
99 ; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
100 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
101 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
108 ; VI-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
109 ; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]]
110 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
111 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD]]
102112 ; VI: v_or_b32_e32
103113 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
104114 %tid = call i32 @llvm.amdgcn.workitem.id.x()
113123 ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
114124 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
115125
116 ; VI-NOT: v_subrev_i16
117 ; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}}
118 ; VI-NOT: v_subrev_i16
119 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
120 ; VI: v_or_b32_e32
126 ; VI: flat_load_dword [[LOAD:v[0-9]+]]
127 ; VI-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, [[LOAD]]
128 ; VI-DAG: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffe0, [[LOAD]]
129 ; VI: v_or_b32_e32 v{{[0-9]+}}, [[ADD]], [[AND]]
121130 define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
122131 %tid = call i32 @llvm.amdgcn.workitem.id.x()
123132 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
135144
136145 ; VI-NOT: v_subrev_i16
137146 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
138 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
147 ; VI: flat_load_dword
148 ; VI: v_add_u16_sdwa [[ADD:v[0-9]+]], v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
139149 ; VI-NOT: v_subrev_i16
140 ; VI: v_or_b32_e32
150 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
141151 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
142152 %tid = call i32 @llvm.amdgcn.workitem.id.x()
143153 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
158168 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
159169 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
160170
161 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
162 ; VI: flat_load_ushort v[[A_HI:[0-9]+]]
163
164 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
165 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
166
167 ; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
168 ; VI-NOT: and
169 ; VI-NOT: shl
170 ; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
171 ; VI-NOT: and
172 ; VI-NOT: shl
173 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
171 ; VI: flat_load_dword v[[A:[0-9]+]]
172 ; VI: flat_load_dword v[[B:[0-9]+]]
173
174 ; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
175 ; VI-NEXT: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
176 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
174177 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
175178 %tid = call i32 @llvm.amdgcn.workitem.id.x()
176179 %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
195198 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
196199 ; GFX9: buffer_store_dwordx4
197200
198 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
199 ; VI: flat_load_ushort v[[A_HI:[0-9]+]]
200 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
201 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
202
203 ; VI: v_sub_u16_e32
204 ; VI: v_sub_u16_e32
205
201 ; VI: flat_load_dword [[A:v[0-9]+]]
202 ; VI: flat_load_dword [[B:v[0-9]+]]
203 ; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], [[A]], [[B]]
204 ; VI: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], [[A]], [[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
206205 ; VI: buffer_store_dwordx4
207206 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
208207 %tid = call i32 @llvm.amdgcn.workitem.id.x()
227226 ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
228227 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
229228
230 ; VI: v_sub_u16_e32
231 ; VI: v_sub_u16_e32
229 ; VI: flat_load_dword
230 ; VI: flat_load_dword
231 ; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
232 ; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
233
232234 ; VI: buffer_store_dwordx2
233235 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
234236 %tid = call i32 @llvm.amdgcn.workitem.id.x()
None ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
1 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s
0 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
1 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s
22
33 ; FIXME: We would still like to vectorize the memory operations for VI
44
55 ; Simple 3-pair chain with loads and stores
66 ; GCN-LABEL: @test1_as_3_3_3_v2f16(
7 ; GFX9: load <2 x half>, <2 x half> addrspace(3)*
8 ; GFX9: load <2 x half>, <2 x half> addrspace(3)*
9 ; GFX9: fmul <2 x half>
10 ; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
11 ; GFX9: ret
12
13 ; VI: load half
14 ; VI: load half
7 ; GFX89: load <2 x half>, <2 x half> addrspace(3)*
8 ; GFX89: load <2 x half>, <2 x half> addrspace(3)*
9 ; GFX89: fmul <2 x half>
10 ; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
11 ; GFX89: ret
1512 define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
1613 %i0 = load half, half addrspace(3)* %a, align 2
1714 %i1 = load half, half addrspace(3)* %b, align 2
2825 }
2926
3027 ; GCN-LABEL: @test1_as_3_0_0(
31 ; GFX9: load <2 x half>, <2 x half> addrspace(3)*
32 ; GFX9: load <2 x half>, <2 x half>*
33 ; GFX9: fmul <2 x half>
34 ; GFX9: store <2 x half> %{{.*}}, <2 x half>* %
35 ; GFX9: ret
36
37 ; VI: load half
38 ; VI: load half
28 ; GFX89: load <2 x half>, <2 x half> addrspace(3)*
29 ; GFX89: load <2 x half>, <2 x half>*
30 ; GFX89: fmul <2 x half>
31 ; GFX89: store <2 x half> %{{.*}}, <2 x half>* %
32 ; GFX89: ret
3933 define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
4034 %i0 = load half, half addrspace(3)* %a, align 2
4135 %i1 = load half, half* %b, align 2
5246 }
5347
5448 ; GCN-LABEL: @test1_as_0_0_3_v2f16(
55 ; GFX9: load <2 x half>, <2 x half>*
56 ; GFX9: load <2 x half>, <2 x half>*
57 ; GFX9: fmul <2 x half>
58 ; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
59 ; GFX9: ret
60
61 ; VI: load half
62 ; VI: load half
49 ; GFX89: load <2 x half>, <2 x half>*
50 ; GFX89: load <2 x half>, <2 x half>*
51 ; GFX89: fmul <2 x half>
52 ; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
53 ; GFX89: ret
6354 define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
6455 %i0 = load half, half* %a, align 2
6556 %i1 = load half, half* %b, align 2