llvm / 0deee39
AMDGPU: Add VI i16 support

Patch By: Wei Ding

Differential Revision: https://reviews.llvm.org/D18049

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286464 91177308-0d34-0410-b5e6-96231b3b80d8

Tom Stellard
42 changed file(s) with 1496 addition(s) and 347 deletion(s).
492492
493493 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
494494
495 def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">;
496
495497 class PredicateControl {
496498 Predicate SubtargetPredicate;
497499 Predicate SIAssemblerPredicate = isSICI;
586586
587587 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
588588 // Truncate is just accessing a subregister.
589 return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
589
590 unsigned SrcSize = Source.getSizeInBits();
591 unsigned DestSize = Dest.getSizeInBits();
592
593 return DestSize < SrcSize && DestSize % 32 == 0;
590594 }
591595
592596 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
593597 // Truncate is just accessing a subregister.
594 return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
595 (Dest->getPrimitiveSizeInBits() % 32 == 0);
598
599 unsigned SrcSize = Source->getScalarSizeInBits();
600 unsigned DestSize = Dest->getScalarSizeInBits();
601
602 if (DestSize == 16 && Subtarget->has16BitInsts())
603 return SrcSize >= 32;
604
605 return DestSize < SrcSize && DestSize % 32 == 0;
596606 }
597607
598608 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
599609 unsigned SrcSize = Src->getScalarSizeInBits();
600610 unsigned DestSize = Dest->getScalarSizeInBits();
611
612 if (SrcSize == 16 && Subtarget->has16BitInsts())
613 return DestSize >= 32;
601614
602615 return SrcSize == 32 && DestSize == 64;
603616 }
607620 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
608621 // this will enable reducing 64-bit operations to 32-bit, which is always
609622 // good.
623
624 if (Src == MVT::i16)
625 return Dest == MVT::i32 || Dest == MVT::i64;
626
610627 return Src == MVT::i32 && Dest == MVT::i64;
611628 }
612629
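Taken together, these hooks teach the combiner that on subtargets with 16-bit instructions, truncating to i16 and extending from i16 are free. A minimal stand-alone sketch of the resulting rules, mirroring the scalar-size logic of the overloads above (illustrative only; not part of the patch):

    #include <cstdio>

    // Hedged sketch of the hook logic above, in terms of scalar bit widths.
    static bool truncateFree(unsigned SrcBits, unsigned DstBits, bool Has16Bit) {
      if (DstBits == 16 && Has16Bit)                 // i16 is legal: truncate is a copy
        return SrcBits >= 32;
      return DstBits < SrcBits && DstBits % 32 == 0; // plain subregister access
    }

    static bool zextFree(unsigned SrcBits, unsigned DstBits, bool Has16Bit) {
      if (SrcBits == 16 && Has16Bit)                 // 16-bit ops zero the high bits
        return DstBits >= 32;
      return SrcBits == 32 && DstBits == 64;         // high half is a free "mov 0"
    }

    int main() {
      std::printf("trunc i32->i16 free on VI: %d\n", truncateFree(32, 16, true));
      std::printf("zext  i16->i64 free on VI: %d\n", zextFree(16, 64, true));
      std::printf("trunc i64->i32 free everywhere: %d\n", truncateFree(64, 32, false));
    }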
24442461
24452462 unsigned Size = VT.getSizeInBits();
24462463 if (VT.isVector() || Size > 64)
2464 return SDValue();
2465
2466 // There are i16 integer mul/mad.
2467 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
24472468 return SDValue();
24482469
24492470 SelectionDAG &DAG = DCI.DAG;
528528
529529 def : Pat <
530530 (fcopysign f32:$src0, f32:$src1),
531 (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1)
531 (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
532532 >;
533533
534534 def : Pat <
535535 (f64 (fcopysign f64:$src0, f64:$src1)),
536536 (REG_SEQUENCE RC64,
537537 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
538 (BFI_INT (LoadImm32 0x7fffffff),
538 (BFI_INT (LoadImm32 (i32 0x7fffffff)),
539539 (i32 (EXTRACT_SUBREG $src0, sub1)),
540540 (i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
541541 >;
544544 (f64 (fcopysign f64:$src0, f32:$src1)),
545545 (REG_SEQUENCE RC64,
546546 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
547 (BFI_INT (LoadImm32 0x7fffffff),
547 (BFI_INT (LoadImm32 (i32 0x7fffffff)),
548548 (i32 (EXTRACT_SUBREG $src0, sub1)),
549549 $src1), sub1)
550550 >;
707707 // int_SI_vs_load_input
708708 def : Pat<
709709 (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
710 (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0)
710 (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0)
711711 >;
712712
713713 // Offset in a 32-bit VGPR
714714 def : Pat <
715715 (SIload_constant v4i32:$sbase, i32:$voff),
716 (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0)
716 (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
717717 >;
718718
719719
913913 >;
914914
915915
916 class MUBUFLoad_Pattern <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
916 class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
917917 PatFrag constant_ld> : Pat <
918918 (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
919919 i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
935935 }
936936
937937 let Predicates = [isSICI] in {
938 def : MUBUFLoad_Pattern ;
939 def : MUBUFLoad_Pattern ;
940 def : MUBUFLoad_Pattern ;
941 def : MUBUFLoad_Pattern _constant>;
938 def : MUBUFLoad_PatternADDR64 _constant>;
939 def : MUBUFLoad_PatternADDR64 ;
940 def : MUBUFLoad_PatternADDR64 ;
941 def : MUBUFLoad_PatternADDR64 ;
942942
943943 defm : MUBUFLoad_Atomic_Pattern ;
944944 defm : MUBUFLoad_Atomic_Pattern ;
945945 } // End Predicates = [isSICI]
946
947 multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
948 PatFrag ld> {
949
950 def : Pat <
951 (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
952 i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
953 (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe)
954 >;
955 }
956
957 let Predicates = [Has16BitInsts] in {
958
959 defm : MUBUFLoad_Pattern ;
960 defm : MUBUFLoad_Pattern ;
961 defm : MUBUFLoad_Pattern ;
962 defm : MUBUFLoad_Pattern ;
963
964 } // End Predicates = [Has16BitInsts]
946965
947966 class MUBUFScratchLoadPat : Pat <
948967 (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
952971
953972 def : MUBUFScratchLoadPat ;
954973 def : MUBUFScratchLoadPat ;
974 def : MUBUFScratchLoadPat ;
975 def : MUBUFScratchLoadPat ;
955976 def : MUBUFScratchLoadPat ;
956977 def : MUBUFScratchLoadPat ;
957978 def : MUBUFScratchLoadPat ;
10241045 defm : MUBUFStore_Atomic_Pattern ;
10251046 } // End Predicates = [isSICI]
10261047
1048
1049 multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
1050 PatFrag st> {
1051
1052 def : Pat <
1053 (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
1054 i16:$offset, i1:$glc, i1:$slc, i1:$tfe)),
1055 (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
1056 >;
1057 }
1058
1059 defm : MUBUFStore_Pattern ;
1060 defm : MUBUFStore_Pattern ;
1061
10271062 class MUBUFScratchStorePat : Pat <
10281063 (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
10291064 u16imm:$offset)),
10321067
10331068 def : MUBUFScratchStorePat ;
10341069 def : MUBUFScratchStorePat ;
1070 def : MUBUFScratchStorePat ;
1071 def : MUBUFScratchStorePat ;
10351072 def : MUBUFScratchStorePat ;
10361073 def : MUBUFScratchStorePat ;
10371074 def : MUBUFScratchStorePat ;
488488
489489 def : DSReadPat ;
490490 def : DSReadPat ;
491 def : DSReadPat ;
492 def : DSReadPat ;
493 def : DSReadPat ;
491494 def : DSReadPat ;
492495 def : DSReadPat ;
496 def : DSReadPat ;
493497 def : DSReadPat ;
494498
495499 let AddedComplexity = 100 in {
511515
512516 def : DSWritePat ;
513517 def : DSWritePat ;
518 def : DSWritePat ;
519 def : DSWritePat ;
514520 def : DSWritePat ;
515521
516522 let AddedComplexity = 100 in {
521527 def : Pat <
522528 (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
523529 i8:$offset1)),
524 (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0),
525 (EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
530 (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
531 (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
526532 (i1 0))
527533 >;
528534
340340
341341 def : FlatLoadPat ;
342342 def : FlatLoadPat ;
343 def : FlatLoadPat ;
344 def : FlatLoadPat ;
343345 def : FlatLoadPat ;
344346 def : FlatLoadPat ;
345347 def : FlatLoadPat ;
388390
389391 } // End Predicates = [isCIVI]
390392
393 let Predicates = [isVI] in {
394 def : FlatStorePat ;
395 def : FlatStorePat ;
396 }
391397
392398
393399 //===----------------------------------------------------------------------===//
7676
7777 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
7878 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
79
80 if (Subtarget->has16BitInsts())
81 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
7982
8083 computeRegisterProperties(STI.getRegisterInfo());
8184
219222 setOperationAction(ISD::FCOS, MVT::f32, Custom);
220223 setOperationAction(ISD::FDIV, MVT::f32, Custom);
221224 setOperationAction(ISD::FDIV, MVT::f64, Custom);
225
226 if (Subtarget->has16BitInsts()) {
227 setOperationAction(ISD::Constant, MVT::i16, Legal);
228
229 setOperationAction(ISD::SMIN, MVT::i16, Legal);
230 setOperationAction(ISD::SMAX, MVT::i16, Legal);
231
232 setOperationAction(ISD::UMIN, MVT::i16, Legal);
233 setOperationAction(ISD::UMAX, MVT::i16, Legal);
234
235 setOperationAction(ISD::SETCC, MVT::i16, Promote);
236 AddPromotedToType(ISD::SETCC, MVT::i16, MVT::i32);
237
238 setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
239 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
240
241 setOperationAction(ISD::ROTR, MVT::i16, Promote);
242 setOperationAction(ISD::ROTL, MVT::i16, Promote);
243
244 setOperationAction(ISD::SDIV, MVT::i16, Promote);
245 setOperationAction(ISD::UDIV, MVT::i16, Promote);
246 setOperationAction(ISD::SREM, MVT::i16, Promote);
247 setOperationAction(ISD::UREM, MVT::i16, Promote);
248
249 setOperationAction(ISD::BSWAP, MVT::i16, Promote);
250 setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
251
252 setOperationAction(ISD::CTTZ, MVT::i16, Promote);
253 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
254 setOperationAction(ISD::CTLZ, MVT::i16, Promote);
255 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
256
257 setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
258
259 setOperationAction(ISD::BR_CC, MVT::i16, Expand);
260
261 setOperationAction(ISD::LOAD, MVT::i16, Custom);
262
263 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
264
265 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
266 AddPromotedToType(ISD::UINT_TO_FP, MVT::i16, MVT::i32);
267 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
268 AddPromotedToType(ISD::SINT_TO_FP, MVT::i16, MVT::i32);
269 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
270 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
271 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
272 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
273 }
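Most of the i16 entries above are Promote: the legalizer re-expresses the node in the i32 type registered via AddPromotedToType and truncates the result back. A hedged C++ sketch of what that rewrite computes for one promoted operation, ISD::SDIV (illustrative only; the real work happens in the SelectionDAG legalizer):

    #include <cstdint>
    #include <cstdio>

    // What "Promote" means for the i16 SDIV entry above: widen the operands
    // to i32, compute there, truncate the result back to i16.
    static int16_t sdiv_i16_via_i32(int16_t a, int16_t b) {
      int32_t wide = int32_t(a) / int32_t(b); // sign-extend operands, divide in i32
      return int16_t(wide);                   // truncate back; a copy on VI
    }

    int main() {
      std::printf("%d\n", sdiv_i16_via_i32(-300, 7)); // prints -42
    }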
222274
223275 setTargetDAGCombine(ISD::FADD);
224276 setTargetDAGCombine(ISD::FSUB);
25572609 EVT MemVT = Load->getMemoryVT();
25582610
25592611 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
2560 assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
25612612 // FIXME: Copied from PPC
25622613 // First, load into 32 bits, then truncate to 1 bit.
25632614
25652616 SDValue BasePtr = Load->getBasePtr();
25662617 MachineMemOperand *MMO = Load->getMemOperand();
25672618
2619 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
2620
25682621 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
2569 BasePtr, MVT::i8, MMO);
2622 BasePtr, RealMemVT, MMO);
25702623
25712624 SDValue Ops[] = {
25722625 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
33803433 }
33813434
33823435 EVT VT = K0->getValueType(0);
3383 return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
3384 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
3436
3437 MVT NVT = MVT::i32;
3438 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3439
3440 SDValue Tmp1, Tmp2, Tmp3;
3441 Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
3442 Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
3443 Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
3444
3445 if (VT == MVT::i16) {
3446 Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
3447 Tmp1, Tmp2, Tmp3);
3448
3449 return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
3450 } else
3451 return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
3452 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
33853453 }
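For i16, the rewritten combine sign- or zero-extends the three operands to i32, forms the 32-bit SMED3/UMED3 (the only widths the med3 instructions support), and truncates the result. A hedged sketch of the value computed, with std::min/std::max standing in for the hardware semantics (illustrative only):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Illustrative only: the i16 path above computes med3 in 32 bits and
    // truncates. med3(a,b,c) is the median: max(min(a,b), min(max(a,b), c)).
    static int16_t smed3_i16(int16_t a, int16_t b, int16_t c) {
      int32_t x = a, y = b, z = c;  // sign-extend, as the combine does
      int32_t med = std::max(std::min(x, y), std::min(std::max(x, y), z));
      return int16_t(med);          // ISD::TRUNCATE back to i16
    }

    int main() {
      std::printf("%d\n", smed3_i16(50, 10, 40)); // clamps 50 into [10,40] -> 40
    }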
33863454
33873455 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
11271127
11281128 include "SIInstructions.td"
11291129 include "CIInstructions.td"
1130 include "VIInstructions.td"
11311130
11321131 include "DSInstructions.td"
11331132 include "MIMGInstructions.td"
373373
374374 def : Pat <
375375 (int_AMDGPU_kilp),
376 (SI_KILL 0xbf800000)
376 (SI_KILL (i32 0xbf800000))
377377 >;
378378
379379 def : Pat <
554554 def : Pat <
555555 (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
556556 (f32 FP_ZERO), (f32 FP_ONE)),
557 (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod)
557 (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
558558 >;
559559
560560 /********** ================================ **********/
565565
566566 def : Pat <
567567 (fneg (fabs f32:$src)),
568 (S_OR_B32 $src, (S_MOV_B32 0x80000000)) // Set sign bit
568 (S_OR_B32 $src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit
569569 >;
570570
571571 // FIXME: Should use S_OR_B32
574574 (REG_SEQUENCE VReg_64,
575575 (i32 (EXTRACT_SUBREG f64:$src, sub0)),
576576 sub0,
577 (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
578 (V_MOV_B32_e32 0x80000000)), // Set sign bit.
577 (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
578 (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
579579 sub1)
580580 >;
581581
582582 def : Pat <
583583 (fabs f32:$src),
584 (V_AND_B32_e64 $src, (V_MOV_B32_e32 0x7fffffff))
584 (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
585585 >;
586586
587587 def : Pat <
588588 (fneg f32:$src),
589 (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000))
589 (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
590590 >;
591591
592592 def : Pat <
594594 (REG_SEQUENCE VReg_64,
595595 (i32 (EXTRACT_SUBREG f64:$src, sub0)),
596596 sub0,
597 (V_AND_B32_e64 (EXTRACT_SUBREG f64:$src, sub1),
598 (V_MOV_B32_e32 0x7fffffff)), // Clear sign bit.
597 (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
598 (V_MOV_B32_e32 (i32 0x7fffffff))), // Clear sign bit.
599599 sub1)
600600 >;
601601
604604 (REG_SEQUENCE VReg_64,
605605 (i32 (EXTRACT_SUBREG f64:$src, sub0)),
606606 sub0,
607 (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
608 (V_MOV_B32_e32 0x80000000)),
607 (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
608 (i32 (V_MOV_B32_e32 (i32 0x80000000)))),
609609 sub1)
610610 >;
611611
665665 def : Pat <
666666 (int_AMDGPU_cube v4f32:$src),
667667 (REG_SEQUENCE VReg_128,
668 (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
669 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1),
670 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2),
668 (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
669 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
670 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
671671 0 /* clamp */, 0 /* omod */), sub0,
672 (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
673 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
674 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2),
672 (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
673 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
674 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
675675 0 /* clamp */, 0 /* omod */), sub1,
676 (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0),
677 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
678 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2),
676 (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
677 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
678 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
679679 0 /* clamp */, 0 /* omod */), sub2,
680 (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0),
681 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
682 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2),
680 (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
681 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
682 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
683683 0 /* clamp */, 0 /* omod */), sub3)
684684 >;
685685
700700 def : Pat <
701701 (AMDGPUurecip i32:$src0),
702702 (V_CVT_U32_F32_e32
703 (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1,
703 (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
704704 (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
705705 >;
706706
766766 //===----------------------------------------------------------------------===//
767767
768768 def : Pat<(i32 (sext_inreg i32:$src, i1)),
769 (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
769 (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
770770
771771 // Handle sext_inreg in i64
772772 def : Pat <
773773 (i64 (sext_inreg i64:$src, i1)),
774 (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16
774 (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
775 >;
776
777 def : Pat <
778 (i16 (sext_inreg i16:$src, i8)),
779 (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
775780 >;
776781
777782 def : Pat <
778783 (i64 (sext_inreg i64:$src, i8)),
779 (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16
784 (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
780785 >;
781786
782787 def : Pat <
783788 (i64 (sext_inreg i64:$src, i16)),
784 (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16
789 (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
785790 >;
786791
787792 def : Pat <
788793 (i64 (sext_inreg i64:$src, i32)),
789 (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16
794 (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
790795 >;
791796
792797 def : Pat <
793798 (i64 (zext i32:$src)),
794 (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1)
799 (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
795800 >;
796801
797802 def : Pat <
803808 (i64 (ext i1:$src)),
804809 (REG_SEQUENCE VReg_64,
805810 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
806 (S_MOV_B32 0), sub1)
811 (S_MOV_B32 (i32 0)), sub1)
807812 >;
808813
809814
815820 def : Pat <
816821 (i64 (sext i32:$src)),
817822 (REG_SEQUENCE SReg_64, $src, sub0,
818 (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1)
823 (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
819824 >;
820825
821826 def : Pat <
822827 (i64 (sext i1:$src)),
823828 (REG_SEQUENCE VReg_64,
824 (V_CNDMASK_B32_e64 0, -1, $src), sub0,
825 (V_CNDMASK_B32_e64 0, -1, $src), sub1)
826 >;
827
828 class FPToI1Pat <Instruction Inst, int KOne, ValueType vt, SDPatternOperator fp_to_int> : Pat <
829 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
830 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
831 >;
832
833 class FPToI1Pat <Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
829834 (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
830 (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
831 >;
832
833 def : FPToI1Pat;
834 def : FPToI1Pat;
835 def : FPToI1Pat;
836 def : FPToI1Pat;
835 (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
836 >;
837
838 def : FPToI1Pat;
839 def : FPToI1Pat;
840 def : FPToI1Pat;
841 def : FPToI1Pat;
837842
838843 // If we need to perform a logical operation on i1 values, we need to
839844 // use vector comparisons since there is only one SCC register. Vector
858863
859864 def : Pat <
860865 (f32 (sint_to_fp i1:$src)),
861 (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
866 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
862867 >;
863868
864869 def : Pat <
865870 (f32 (uint_to_fp i1:$src)),
866 (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src)
871 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
867872 >;
868873
869874 def : Pat <
887892
888893 def : Pat <
889894 (i1 (trunc i32:$a)),
890 (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), 1)
895 (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
891896 >;
892897
893898 def : Pat <
894899 (i1 (trunc i64:$a)),
895900 (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
896 (EXTRACT_SUBREG $a, sub0)), 1)
901 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
897902 >;
898903
899904 def : Pat <
900905 (i32 (bswap i32:$a)),
901 (V_BFI_B32 (S_MOV_B32 0x00ff00ff),
902 (V_ALIGNBIT_B32 $a, $a, 24),
903 (V_ALIGNBIT_B32 $a, $a, 8))
906 (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
907 (V_ALIGNBIT_B32 $a, $a, (i32 24)),
908 (V_ALIGNBIT_B32 $a, $a, (i32 8)))
904909 >;
905910
906911 def : Pat <
916921
917922 def : Pat <
918923 (vt (add (vt (shl 1, vt:$a)), -1)),
919 (BFM $a, (MOV 0))
924 (BFM $a, (MOV (i32 0)))
920925 >;
921926 }
922927
927932
928933 def : Pat<
929934 (fcanonicalize f32:$src),
930 (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0)
935 (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
931936 >;
932937
933938 def : Pat<
962967 (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
963968 DSTCLAMP.NONE, DSTOMOD.NONE),
964969 $x,
965 (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)),
970 (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
966971 DSTCLAMP.NONE, DSTOMOD.NONE)
967972 >;
968973
122122 // TODO: Do we need to set DwarfRegAlias on register tuples?
123123
124124 // SGPR 32-bit registers
125 def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
125 def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
126126 (add (sequence "SGPR%u", 0, 103))> {
127127 let AllocationPriority = 1;
128128 }
189189 (add (decimate (shl TTMP_32, 3), 4))]>;
190190
191191 // VGPR 32-bit registers
192 def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
192 // i16 only on VI+
193 def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
193194 (add (sequence "VGPR%u", 0, 255))> {
194195 let AllocationPriority = 1;
195196 let Size = 32;
257258 }
258259
259260 // Register class for all scalar registers (SGPRs + Special Registers)
260 def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
261 (add SReg_32_XM0, M0)> {
261 def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
262 (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> {
262263 let AllocationPriority = 1;
263264 }
264265
345346 let Size = 32;
346347 }
347348
348 def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)> {
349 def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)> {
349350 let isAllocatable = 0;
350351 }
351352
878878 (i64 (ctpop i64:$src)),
879879 (i64 (REG_SEQUENCE SReg_64,
880880 (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
881 (S_MOV_B32 0), sub1))
881 (S_MOV_B32 (i32 0)), sub1))
882882 >;
883883
884884 def : Pat <
885885 (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
886886 (S_ABS_I32 $x)
887887 >;
888
889 def : Pat <
890 (i16 imm:$imm),
891 (S_MOV_B32 imm:$imm)
892 >;
893
894 // Same as a 32-bit inreg
895 def : Pat<
896 (i32 (sext i16:$src)),
897 (S_SEXT_I32_I16 $src)
898 >;
899
888900
889901 //===----------------------------------------------------------------------===//
890902 // SOP2 Patterns
896908 (i32 (addc i32:$src0, i32:$src1)),
897909 (S_ADD_U32 $src0, $src1)
898910 >;
911
912 // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
913 // REG_SEQUENCE patterns don't support instructions with multiple
914 // outputs.
915 def : Pat<
916 (i64 (zext i16:$src)),
917 (REG_SEQUENCE SReg_64,
918 (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0,
919 (S_MOV_B32 (i32 0)), sub1)
920 >;
921
922 def : Pat <
923 (i64 (sext i16:$src)),
924 (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
925 (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
926 >;
927
928 def : Pat<
929 (i32 (zext i16:$src)),
930 (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
931 >;
932
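These scalar patterns assemble the 64-bit result from two 32-bit SALU values: S_SEXT_I32_I16 (or an AND with 0xffff for zext) produces the low word, and an arithmetic shift by 31 (or a zero move) produces the high word. A hedged sketch of the sext case (illustrative only):

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: mirrors the (i64 (sext i16:$src)) pattern above.
    static int64_t sext_i16_to_i64(int16_t src) {
      int32_t lo = int32_t(src);   // S_SEXT_I32_I16
      int32_t hi = lo >> 31;       // S_ASHR_I32 lo, 31: replicate the sign
      // REG_SEQUENCE sub0/sub1: pack low and high words into one 64-bit value.
      return int64_t((uint64_t(uint32_t(hi)) << 32) | uint32_t(lo));
    }

    int main() {
      std::printf("%lld\n", (long long)sext_i16_to_i64(-5)); // prints -5
    }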
933
899934
900935 //===----------------------------------------------------------------------===//
901936 // SOPP Patterns
77 //===----------------------------------------------------------------------===//
88 // Instruction definitions for VI and newer.
99 //===----------------------------------------------------------------------===//
10
11 FIXME: Deleting this file broke buildbots that don't do full rebuilds. This
12 file is no longer used by the backend, so it can be deleted once all
13 the buildbots update their dependencies.
297297 defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16>;
298298 defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16>;
299299 defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16>;
300
301 }
302
303 let Predicates = [isVI] in {
304
305 def : Pat<
306 (f32 (f16_to_fp i16:$src)),
307 (V_CVT_F32_F16_e32 $src)
308 >;
309
310 def : Pat<
311 (i16 (fp_to_f16 f32:$src)),
312 (V_CVT_F16_F32_e32 $src)
313 >;
300314
301315 }
302316
560574 let Predicates = [isVI] in {
561575
562576 def : Pat <
563 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
564 imm:$bound_ctrl),
577 (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
578 imm:$bound_ctrl)),
565579 (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
566580 (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
567581 >;
568582
583
584 def : Pat<
585 (i32 (anyext i16:$src)),
586 (COPY $src)
587 >;
588
589 def : Pat<
590 (i64 (anyext i16:$src)),
591 (REG_SEQUENCE VReg_64,
592 (i32 (COPY $src)), sub0,
593 (V_MOV_B32_e32 (i32 0)), sub1)
594 >;
595
596 def : Pat<
597 (i16 (trunc i32:$src)),
598 (COPY $src)
599 >;
600
601 def : Pat<
602 (i1 (trunc i16:$src)),
603 (COPY $src)
604 >;
605
606
607 def : Pat <
608 (i16 (trunc i64:$src)),
609 (EXTRACT_SUBREG $src, sub0)
610 >;
611
569612 } // End Predicates = [isVI]
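The anyext/trunc selections above are plain COPYs because an i16 value already occupies a full 32-bit VGPR; only the type label changes. A hedged model of that invariant (illustrative only; VGPRModel is a made-up stand-in for a 32-bit register):

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: i16 lives in a 32-bit VGPR, so (i16 (trunc i32)) and
    // (i32 (anyext i16)) move no bits; they are register copies.
    using VGPRModel = uint32_t;

    static VGPRModel trunc_i32_to_i16(VGPRModel v) { return v; }  // COPY
    static VGPRModel anyext_i16_to_i32(VGPRModel v) { return v; } // COPY

    int main() {
      VGPRModel v = 0x12345678;
      std::printf("%#x\n", anyext_i16_to_i32(trunc_i32_to_i16(v))); // 0x12345678
    }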
343343 } // End isCommutable = 1
344344
345345 } // End SubtargetPredicate = isVI
346
347 // Note: 16-bit instructions produce a 0 result in the high 16 bits.
348 multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
349
350 def : Pat<
351 (op i16:$src0, i16:$src1),
352 (inst $src0, $src1)
353 >;
354
355 def : Pat<
356 (i32 (zext (op i16:$src0, i16:$src1))),
357 (inst $src0, $src1)
358 >;
359
360 def : Pat<
361 (i64 (zext (op i16:$src0, i16:$src1))),
362 (REG_SEQUENCE VReg_64,
363 (inst $src0, $src1), sub0,
364 (V_MOV_B32_e32 (i32 0)), sub1)
365 >;
366
367 }
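This multiclass leans on the note above: because a VI 16-bit VALU op forces the high 16 bits of the destination VGPR to zero, the zext-of-op patterns can select the very same instruction with no extra masking. A hedged model of that invariant (illustrative only):

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: after v_add_u16 the 32-bit VGPR holds the 16-bit sum
    // zero-extended, so (i32 (zext (add i16 ...))) needs no extra instruction.
    static uint32_t v_add_u16_model(uint32_t src0, uint32_t src1) {
      return uint16_t(src0 + src1); // low 16-bit sum; high 16 bits forced to 0
    }

    int main() {
      std::printf("%#x\n", v_add_u16_model(0xffff, 2)); // prints 0x1 (wraps, zext'd)
    }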
368
369 multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> {
370
371 def : Pat<
372 (op i16:$src0, i32:$src1),
373 (inst $src1, $src0)
374 >;
375
376 def : Pat<
377 (i32 (zext (op i16:$src0, i32:$src1))),
378 (inst $src1, $src0)
379 >;
380
381
382 def : Pat<
383 (i64 (zext (op i16:$src0, i32:$src1))),
384 (REG_SEQUENCE VReg_64,
385 (inst $src1, $src0), sub0,
386 (V_MOV_B32_e32 (i32 0)), sub1)
387 >;
388 }
389
390 class ZExt_i16_i1_Pat <SDNode ext> : Pat <
391 (i16 (ext i1:$src)),
392 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
393 >;
394
395 let Predicates = [isVI] in {
396
397 defm : Arithmetic_i16_Pats;
398 defm : Arithmetic_i16_Pats;
399 defm : Arithmetic_i16_Pats;
400 defm : Arithmetic_i16_Pats;
401 defm : Arithmetic_i16_Pats;
402 defm : Arithmetic_i16_Pats;
403 defm : Arithmetic_i16_Pats;
404
405 defm : Arithmetic_i16_Pats;
406 defm : Arithmetic_i16_Pats;
407 defm : Arithmetic_i16_Pats;
408
409 defm : Bits_OpsRev_i16_Pats;
410 defm : Bits_OpsRev_i16_Pats;
411 defm : Bits_OpsRev_i16_Pats;
412
413 def : ZExt_i16_i1_Pat;
414 def : ZExt_i16_i1_Pat;
415 def : ZExt_i16_i1_Pat;
416
417 } // End Predicates = [isVI]
346418
347419 //===----------------------------------------------------------------------===//
348420 // SI
220220 }
221221
222222 } // End SubtargetPredicate = isVI
223
224 def : Pat <
225 (i16 (select i1:$src0, i16:$src1, i16:$src2)),
226 (V_CNDMASK_B32_e64 $src2, $src1, $src0)
227 >;
228
229 let Predicates = [isVI] in {
230
231 multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
232 Instruction inst, SDPatternOperator op3> {
233 def : Pat<
234 (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
235 (inst i16:$src0, i16:$src1, i16:$src2)
236 >;
237
238 def : Pat<
239 (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
240 (inst i16:$src0, i16:$src1, i16:$src2)
241 >;
242
243 def : Pat<
244 (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
245 (REG_SEQUENCE VReg_64,
246 (inst i16:$src0, i16:$src1, i16:$src2), sub0,
247 (V_MOV_B32_e32 (i32 0)), sub1)
248 >;
249 }
250
251 defm: Tenary_i16_Pats;
252 defm: Tenary_i16_Pats;
253
254 } // End Predicates = [isVI]
223255
224256
225257 //===----------------------------------------------------------------------===//
0 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
1
2 ; FIXME: Need to handle non-uniform case for function below (load without gep).
3 ; GCN-LABEL: {{^}}v_test_add_i16:
4 ; VI: flat_load_ushort [[A:v[0-9]+]]
5 ; VI: flat_load_ushort [[B:v[0-9]+]]
6 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
7 ; VI-NEXT: buffer_store_short [[ADD]]
8 define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
9 %tid = call i32 @llvm.amdgcn.workitem.id.x()
10 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
11 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
12 %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
13 %a = load volatile i16, i16 addrspace(1)* %gep.in0
14 %b = load volatile i16, i16 addrspace(1)* %gep.in1
15 %add = add i16 %a, %b
16 store i16 %add, i16 addrspace(1)* %out
17 ret void
18 }
19
20 ; FIXME: Need to handle non-uniform case for function below (load without gep).
21 ; GCN-LABEL: {{^}}v_test_add_i16_constant:
22 ; VI: flat_load_ushort [[A:v[0-9]+]]
23 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]]
24 ; VI-NEXT: buffer_store_short [[ADD]]
25 define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
26 %tid = call i32 @llvm.amdgcn.workitem.id.x()
27 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
28 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
29 %a = load volatile i16, i16 addrspace(1)* %gep.in0
30 %add = add i16 %a, 123
31 store i16 %add, i16 addrspace(1)* %out
32 ret void
33 }
34
35 ; FIXME: Need to handle non-uniform case for function below (load without gep).
36 ; GCN-LABEL: {{^}}v_test_add_i16_neg_constant:
37 ; VI: flat_load_ushort [[A:v[0-9]+]]
38 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]]
39 ; VI-NEXT: buffer_store_short [[ADD]]
40 define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
41 %tid = call i32 @llvm.amdgcn.workitem.id.x()
42 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
43 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
44 %a = load volatile i16, i16 addrspace(1)* %gep.in0
45 %add = add i16 %a, -845
46 store i16 %add, i16 addrspace(1)* %out
47 ret void
48 }
49
50 ; FIXME: Need to handle non-uniform case for function below (load without gep).
51 ; GCN-LABEL: {{^}}v_test_add_i16_inline_neg1:
52 ; VI: flat_load_ushort [[A:v[0-9]+]]
53 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]]
54 ; VI-NEXT: buffer_store_short [[ADD]]
55 define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
56 %tid = call i32 @llvm.amdgcn.workitem.id.x()
57 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
58 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
59 %a = load volatile i16, i16 addrspace(1)* %gep.in0
60 %add = add i16 %a, -1
61 store i16 %add, i16 addrspace(1)* %out
62 ret void
63 }
64
65 ; FIXME: Need to handle non-uniform case for function below (load without gep).
66 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32:
67 ; VI: flat_load_ushort [[A:v[0-9]+]]
68 ; VI: flat_load_ushort [[B:v[0-9]+]]
69 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
70 ; VI-NEXT: buffer_store_dword [[ADD]]
71 define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
72 %tid = call i32 @llvm.amdgcn.workitem.id.x()
73 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
74 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
75 %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
76 %a = load volatile i16, i16 addrspace(1)* %gep.in0
77 %b = load volatile i16, i16 addrspace(1)* %gep.in1
78 %add = add i16 %a, %b
79 %ext = zext i16 %add to i32
80 store i32 %ext, i32 addrspace(1)* %out
81 ret void
82 }
83
84 ; FIXME: Need to handle non-uniform case for function below (load without gep).
85 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
86 ; VI: flat_load_ushort [[A:v[0-9]+]]
87 ; VI: flat_load_ushort [[B:v[0-9]+]]
88 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
89 ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
90 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
91 define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
92 %tid = call i32 @llvm.amdgcn.workitem.id.x()
93 %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
94 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
95 %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
96 %a = load volatile i16, i16 addrspace(1)* %gep.in0
97 %b = load volatile i16, i16 addrspace(1)* %gep.in1
98 %add = add i16 %a, %b
99 %ext = zext i16 %add to i64
100 store i64 %ext, i64 addrspace(1)* %out
101 ret void
102 }
103
104 ; FIXME: Need to handle non-uniform case for function below (load without gep).
105 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
106 ; VI: flat_load_ushort [[A:v[0-9]+]]
107 ; VI: flat_load_ushort [[B:v[0-9]+]]
108 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
109 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
110 ; VI-NEXT: buffer_store_dword [[SEXT]]
111 define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
112 %tid = call i32 @llvm.amdgcn.workitem.id.x()
113 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
114 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
115 %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
116 %a = load i16, i16 addrspace(1)* %gep.in0
117 %b = load i16, i16 addrspace(1)* %gep.in1
118 %add = add i16 %a, %b
119 %ext = sext i16 %add to i32
120 store i32 %ext, i32 addrspace(1)* %out
121 ret void
122 }
123
124 ; FIXME: Need to handle non-uniform case for function below (load without gep).
125 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
126 ; VI: flat_load_ushort [[A:v[0-9]+]]
127 ; VI: flat_load_ushort [[B:v[0-9]+]]
128 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
129 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
130 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
131 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
132 define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
133 %tid = call i32 @llvm.amdgcn.workitem.id.x()
134 %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
135 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
136 %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
137 %a = load i16, i16 addrspace(1)* %gep.in0
138 %b = load i16, i16 addrspace(1)* %gep.in1
139 %add = add i16 %a, %b
140 %ext = sext i16 %add to i64
141 store i64 %ext, i64 addrspace(1)* %out
142 ret void
143 }
144
145 declare i32 @llvm.amdgcn.workitem.id.x() #0
146
147 attributes #0 = { nounwind readnone }
148 attributes #1 = { nounwind }
None ; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
1 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
0 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
3 ; CHECK-LABEL: {{^}}anyext_i1_i32:
4 ; CHECK: v_cndmask_b32_e64
3 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
4 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
5
6 ; GCN-LABEL: {{^}}anyext_i1_i32:
7 ; GCN: v_cndmask_b32_e64
58 define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
69 entry:
7 %0 = icmp eq i32 %cond, 0
8 %1 = zext i1 %0 to i8
9 %2 = xor i8 %1, -1
10 %3 = and i8 %2, 1
11 %4 = zext i8 %3 to i32
12 store i32 %4, i32 addrspace(1)* %out
10 %tmp = icmp eq i32 %cond, 0
11 %tmp1 = zext i1 %tmp to i8
12 %tmp2 = xor i8 %tmp1, -1
13 %tmp3 = and i8 %tmp2, 1
14 %tmp4 = zext i8 %tmp3 to i32
15 store i32 %tmp4, i32 addrspace(1)* %out
1316 ret void
1417 }
18
19 ; GCN-LABEL: {{^}}s_anyext_i16_i32:
20 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]],
21 ; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]]
22 ; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]]
23 ; VI: buffer_store_dword [[AND]]
24 define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
25 entry:
26 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
27 %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
28 %a.ptr = getelementptr i16, i16 addrspace(1)* %a, i32 %tid.x
29 %b.ptr = getelementptr i16, i16 addrspace(1)* %b, i32 %tid.y
30 %a.l = load i16, i16 addrspace(1)* %a.ptr
31 %b.l = load i16, i16 addrspace(1)* %b.ptr
32 %tmp = add i16 %a.l, %b.l
33 %tmp1 = trunc i16 %tmp to i8
34 %tmp2 = xor i8 %tmp1, -1
35 %tmp3 = and i8 %tmp2, 1
36 %tmp4 = zext i8 %tmp3 to i32
37 store i32 %tmp4, i32 addrspace(1)* %out
38 ret void
39 }
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
11 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
23
34 declare i16 @llvm.bitreverse.i16(i16) #1
45 declare i32 @llvm.bitreverse.i32(i32) #1
1112 declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
1213
1314 ; FUNC-LABEL: {{^}}s_brev_i16:
14 ; SI: s_brev_b32
15 ; SI: s_brev_b32
1516 define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
1617 %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
1718 store i16 %brev, i16 addrspace(1)* %out
115115 ; OPT: store
116116 ; OPT: ret
117117
118 ; For GFX8: since i16 is a legal type, we cannot sink the lshr into BBs.
118119
119120 ; GCN-LABEL: {{^}}sink_ubfe_i16:
120121 ; GCN-NOT: lshr
122 ; VI: s_bfe_u32 s0, s0, 0xc0004
121123 ; GCN: s_cbranch_vccnz
122124
123 ; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
125 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
126 ; VI: s_and_b32 s0, s0, 0xff
127
124128 ; GCN: BB2_2:
125 ; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
129 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
130 ; VI: s_and_b32 s0, s0, 0x7f
126131
127132 ; GCN: BB2_3:
128133 ; GCN: buffer_store_short
None ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
2
3 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
4 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
25
36 ; FUNC-LABEL: {{^}}test_copy_v4i8:
4 ; SI: buffer_load_dword [[REG:v[0-9]+]]
5 ; SI: buffer_store_dword [[REG]]
6 ; SI: s_endpgm
7 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
8 ; GCN: buffer_store_dword [[REG]]
9 ; GCN: s_endpgm
710 define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
811 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
912 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
1114 }
1215
1316 ; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
14 ; SI: buffer_load_dword [[REG:v[0-9]+]]
15 ; SI: buffer_store_dword [[REG]]
16 ; SI: buffer_store_dword [[REG]]
17 ; SI: s_endpgm
17 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
18 ; GCN: buffer_store_dword [[REG]]
19 ; GCN: buffer_store_dword [[REG]]
20 ; GCN: s_endpgm
1821 define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
1922 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
2023 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
2326 }
2427
2528 ; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
26 ; SI: buffer_load_dword [[REG:v[0-9]+]]
27 ; SI: buffer_store_dword [[REG]]
28 ; SI: buffer_store_dword [[REG]]
29 ; SI: buffer_store_dword [[REG]]
30 ; SI: s_endpgm
29 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
30 ; GCN: buffer_store_dword [[REG]]
31 ; GCN: buffer_store_dword [[REG]]
32 ; GCN: buffer_store_dword [[REG]]
33 ; GCN: s_endpgm
3134 define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
3235 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
3336 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
3740 }
3841
3942 ; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
40 ; SI: buffer_load_dword [[REG:v[0-9]+]]
41 ; SI: buffer_store_dword [[REG]]
42 ; SI: buffer_store_dword [[REG]]
43 ; SI: buffer_store_dword [[REG]]
44 ; SI: buffer_store_dword [[REG]]
45 ; SI: s_endpgm
43 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
44 ; GCN: buffer_store_dword [[REG]]
45 ; GCN: buffer_store_dword [[REG]]
46 ; GCN: buffer_store_dword [[REG]]
47 ; GCN: buffer_store_dword [[REG]]
48 ; GCN: s_endpgm
4649 define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
4750 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
4851 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
5356 }
5457
5558 ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
56 ; SI: buffer_load_dword
57 ; SI-DAG: v_lshrrev_b32
58 ; SI: v_and_b32
59 ; SI: v_or_b32
60 ; SI-DAG: buffer_store_dword
61 ; SI-DAG: buffer_store_dword
59 ; GCN: buffer_load_dword
60 ; GCN-DAG: v_lshrrev_b32
61 ; GCN: v_and_b32
62 ; GCN: v_or_b32
63 ; GCN-DAG: buffer_store_dword
64 ; GCN-DAG: buffer_store_dword
6265
63 ; SI: s_endpgm
66 ; GCN: s_endpgm
6467 define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
6568 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
6669 %add = add <4 x i8> %val,
6972 ret void
7073 }
7174
75 ; FIXME: Need to handle non-uniform case for function below (load without gep).
7276 ; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use:
73 ; SI: buffer_load_dword
74 ; SI-DAG: v_lshrrev_b32
77 ; GCN: {{buffer|flat}}_load_dword
78 ; GCN-DAG: v_lshrrev_b32
7579 ; SI-DAG: v_add_i32
76 ; SI-DAG: v_and_b32
77 ; SI-DAG: v_or_b32
78 ; SI-DAG: buffer_store_dword
79 ; SI: buffer_store_dword
80 ; SI: buffer_store_dword
81 ; SI: s_endpgm
80 ; VI-DAG: v_add_u16
81 ; GCN-DAG: v_and_b32
82 ; GCN-DAG: v_or_b32
83 ; GCN-DAG: {{buffer|flat}}_store_dword
84 ; GCN: {{buffer|flat}}_store_dword
85 ; GCN: {{buffer|flat}}_store_dword
86 ; GCN: s_endpgm
8287 define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
83 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
88 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
89 %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
90 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
8491 %add = add <4 x i8> %val,
8592 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
8693 store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
8996 }
9097
9198 ; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
92 ; SI: buffer_load_dword
93 ; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
94 ; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
95 ; SI: s_endpgm
99 ; GCN: buffer_load_dword
100 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
101 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
102 ; GCN: s_endpgm
96103 define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
97104 %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
98105 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
100107 }
101108
102109 ; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
103 ; SI-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
104 ; SI-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
105 ; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
106 ; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
107 ; SI: s_endpgm
110 ; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
111 ; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
112 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
113 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
114 ; GCN: s_endpgm
108115 define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
109116 %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
110117 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
112119 }
113120
114121 ; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
115 ; SI: buffer_load_ubyte
116 ; SI: buffer_load_ubyte
117 ; SI: buffer_load_ubyte
122 ; GCN: buffer_load_ubyte
123 ; GCN: buffer_load_ubyte
124 ; GCN: buffer_load_ubyte
118125
119 ; SI: buffer_store_byte
120 ; SI: buffer_store_byte
121 ; SI: buffer_store_byte
122 ; SI: s_endpgm
126 ; GCN: buffer_store_byte
127 ; GCN: buffer_store_byte
128 ; GCN: buffer_store_byte
129 ; GCN: s_endpgm
123130 define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
124131 %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
125132 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
127134 }
128135
129136 ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
130 ; SI: buffer_load_ubyte
131 ; SI: buffer_load_ubyte
132 ; SI: buffer_load_ubyte
133 ; SI: buffer_load_ubyte
134 ; SI: buffer_store_dword
135 ; SI: s_endpgm
137 ; GCN: buffer_load_ubyte
138 ; GCN: buffer_load_ubyte
139 ; GCN: buffer_load_ubyte
140 ; GCN: buffer_load_ubyte
141 ; GCN: buffer_store_dword
142 ; GCN: s_endpgm
136143 define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
137144 %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
138145 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
140147 }
141148
142149 ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
143 ; SI: buffer_load_ubyte
144 ; SI: buffer_load_ubyte
145 ; SI: buffer_load_ubyte
146 ; SI: buffer_load_ubyte
147 ; SI: buffer_store_byte
148 ; SI: buffer_store_byte
149 ; SI: buffer_store_byte
150 ; SI: buffer_store_byte
151 ; SI: s_endpgm
150 ; GCN: buffer_load_ubyte
151 ; GCN: buffer_load_ubyte
152 ; GCN: buffer_load_ubyte
153 ; GCN: buffer_load_ubyte
154 ; GCN: buffer_store_byte
155 ; GCN: buffer_store_byte
156 ; GCN: buffer_store_byte
157 ; GCN: buffer_store_byte
158 ; GCN: s_endpgm
152159 define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
153160 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
154161 store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
9999 ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
100100 ; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
101101 ; GCN: buffer_store_byte [[RESULT]],
102 ; GCN: s_endpgm
102103 define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
103104 %val = load i8, i8 addrspace(1)* %valptr
104105 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
22 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
2929 }
3030
3131 ; GCN-LABEL: {{^}}legacy_cube:
32 ; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
33 ; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
34 ; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
35 ; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
32 ; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
33 ; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
34 ; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
35 ; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
3636 ; GCN: buffer_store_dwordx4
3737 define void @legacy_cube(<4 x float> addrspace(1)* %out, <4 x float> %abcx) #1 {
3838 %cube = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %abcx)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
44 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
55
6 ; SI-LABEL: {{^}}load_i8_to_f32:
7 ; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]],
8 ; SI-NOT: bfe
9 ; SI-NOT: lshr
10 ; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
11 ; SI: buffer_store_dword [[CONV]],
6 ; GCN-LABEL: {{^}}load_i8_to_f32:
7 ; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]],
8 ; GCN-NOT: bfe
9 ; GCN-NOT: lshr
10 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
11 ; GCN: buffer_store_dword [[CONV]],
1212 define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
1313 %load = load i8, i8 addrspace(1)* %in, align 1
1414 %cvt = uitofp i8 %load to float
1616 ret void
1717 }
1818
19 ; SI-LABEL: {{^}}load_v2i8_to_v2f32:
20 ; SI: buffer_load_ushort [[LD:v[0-9]+]]
21 ; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
22 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
23 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
19 ; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
20 ; GCN: buffer_load_ushort [[LD:v[0-9]+]]
21 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
22 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
23 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
2424 define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
2525 %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
2626 %cvt = uitofp <2 x i8> %load to <2 x float>
2828 ret void
2929 }
3030
31 ; SI-LABEL: {{^}}load_v3i8_to_v3f32:
32 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
33 ; SI-NOT: v_cvt_f32_ubyte3_e32
34 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
35 ; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
36 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
37 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
31 ; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
32 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
33 ; GCN-NOT: v_cvt_f32_ubyte3_e32
34 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
35 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
36 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
37 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
3838 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
3939 %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
4040 %cvt = uitofp <3 x i8> %load to <3 x float>
4242 ret void
4343 }
4444
45 ; SI-LABEL: {{^}}load_v4i8_to_v4f32:
46 ; SI: buffer_load_dword [[LOADREG:v[0-9]+]]
47 ; SI-NOT: bfe
48 ; SI-NOT: lshr
49 ; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
50 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
51 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
52 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
53 ; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
45 ; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
46 ; GCN: buffer_load_dword [[LOADREG:v[0-9]+]]
47 ; GCN-NOT: bfe
48 ; GCN-NOT: lshr
49 ; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
50 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
51 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
52 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
53 ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
5454 define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
5555 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
5656 %cvt = uitofp <4 x i8> %load to <4 x float>
6262 ; position in the word for the component.
6363
6464 ; FIXME: Packing bytes
65 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
66 ; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
67 ; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
68 ; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
69 ; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
70 ; SI-DAG: v_lshlrev_b32
71 ; SI-DAG: v_or_b32
72 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
73 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
74 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
75 ; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
76
77 ; SI: buffer_store_dwordx4
65 ; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
66 ; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
67 ; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
68 ; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
69 ; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
70 ; GCN-DAG: v_lshlrev_b32
71 ; GCN-DAG: v_or_b32
72 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
73 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
74 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}},
75 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
76
77 ; GCN: buffer_store_dwordx4
7878 define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
7979 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
8080 %cvt = uitofp <4 x i8> %load to <4 x float>
8484
8585 ; FIXME: Need to handle non-uniform case for function below (load without gep).
8686 ; Instructions still emitted to repack bytes for add use.
87 ; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
88 ; SI: {{buffer|flat}}_load_dword
89 ; SI-DAG: v_cvt_f32_ubyte0_e32
90 ; SI-DAG: v_cvt_f32_ubyte1_e32
91 ; SI-DAG: v_cvt_f32_ubyte2_e32
92 ; SI-DAG: v_cvt_f32_ubyte3_e32
93
94 ; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
95 ; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16
87
88 ; GCN-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
89 ; GCN: {{buffer|flat}}_load_dword
90 ; GCN-DAG: v_cvt_f32_ubyte0_e32
91 ; GCN-DAG: v_cvt_f32_ubyte1_e32
92 ; GCN-DAG: v_cvt_f32_ubyte2_e32
93 ; GCN-DAG: v_cvt_f32_ubyte3_e32
94
95 ; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
96 ; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16
97
9698 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
9799 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8
98100 ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
99101 ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00,
100102 ; SI-DAG: v_add_i32
101103
102 ; SI: {{buffer|flat}}_store_dwordx4
103 ; SI: {{buffer|flat}}_store_dword
104
105 ; SI: s_endpgm
104 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffffff00,
105 ; VI-DAG: v_add_u16_e32
106 ; VI-DAG: v_add_u16_e32
107
108 ; GCN: {{buffer|flat}}_store_dwordx4
109 ; GCN: {{buffer|flat}}_store_dword
110
111 ; GCN: s_endpgm
106112 define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
107113 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
108114 %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
115121 }
116122
117123 ; Make sure this doesn't crash.
118 ; SI-LABEL: {{^}}load_v7i8_to_v7f32:
119 ; SI: s_endpgm
124 ; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
125 ; GCN: s_endpgm
120126 define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
121127 %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
122128 %cvt = uitofp <7 x i8> %load to <7 x float>
124130 ret void
125131 }
126132
127 ; SI-LABEL: {{^}}load_v8i8_to_v8f32:
128 ; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
129 ; SI-NOT: bfe
130 ; SI-NOT: lshr
131 ; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
132 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
133 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
134 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
135 ; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
136 ; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
137 ; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
138 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
139 ; SI-NOT: bfe
140 ; SI-NOT: lshr
141 ; SI: buffer_store_dwordx4
142 ; SI: buffer_store_dwordx4
133 ; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
134 ; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
135 ; GCN-NOT: bfe
136 ; GCN-NOT: lshr
137 ; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
138 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
139 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
140 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
141 ; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
142 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
143 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
144 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
145 ; GCN-NOT: bfe
146 ; GCN-NOT: lshr
147 ; GCN: buffer_store_dwordx4
148 ; GCN: buffer_store_dwordx4
143149 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
144150 %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
145151 %cvt = uitofp <8 x i8> %load to <8 x float>
147153 ret void
148154 }
149155
150 ; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
151 ; SI: buffer_load_dword [[LOADREG:v[0-9]+]],
152 ; SI: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
153 ; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
154 ; SI: buffer_store_dword [[CONV]],
156 ; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
157 ; GCN: buffer_load_dword [[LOADREG:v[0-9]+]],
158 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
159 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
160 ; GCN: buffer_store_dword [[CONV]],
155161 define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
156162 %load = load i32, i32 addrspace(1)* %in, align 4
157163 %add = add i32 %load, 2
161167 ret void
162168 }
163169
164 ; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
170 ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
165171 define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
166172 %load = load i32, i32 addrspace(1)* %in, align 4
167173 %inreg = and i32 %load, 65280
173179
174180 ; We don't get these ones because of the zext, but instcombine removes
175181 ; them so it shouldn't really matter.
176 ; SI-LABEL: {{^}}i8_zext_i32_to_f32:
182 ; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
177183 define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
178184 %load = load i8, i8 addrspace(1)* %in, align 1
179185 %ext = zext i8 %load to i32
182188 ret void
183189 }
184190
185 ; SI-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
191 ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
186192 define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
187193 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
188194 %ext = zext <4 x i8> %load to <4 x i32>
191197 ret void
192198 }
193199
194 ; SI-LABEL: {{^}}extract_byte0_to_f32:
195 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
196 ; SI-NOT: [[VAL]]
197 ; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
198 ; SI: buffer_store_dword [[CONV]]
200 ; GCN-LABEL: {{^}}extract_byte0_to_f32:
201 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
202 ; GCN-NOT: [[VAL]]
203 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
204 ; GCN: buffer_store_dword [[CONV]]
199205 define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
200206 %val = load i32, i32 addrspace(1)* %in
201207 %and = and i32 %val, 255
204210 ret void
205211 }
206212
207 ; SI-LABEL: {{^}}extract_byte1_to_f32:
208 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
209 ; SI-NOT: [[VAL]]
210 ; SI: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
211 ; SI: buffer_store_dword [[CONV]]
213 ; GCN-LABEL: {{^}}extract_byte1_to_f32:
214 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
215 ; GCN-NOT: [[VAL]]
216 ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
217 ; GCN: buffer_store_dword [[CONV]]
212218 define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
213219 %val = load i32, i32 addrspace(1)* %in
214220 %srl = lshr i32 %val, 8
218224 ret void
219225 }
220226
221 ; SI-LABEL: {{^}}extract_byte2_to_f32:
222 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
223 ; SI-NOT: [[VAL]]
224 ; SI: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
225 ; SI: buffer_store_dword [[CONV]]
227 ; GCN-LABEL: {{^}}extract_byte2_to_f32:
228 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
229 ; GCN-NOT: [[VAL]]
230 ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
231 ; GCN: buffer_store_dword [[CONV]]
226232 define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
227233 %val = load i32, i32 addrspace(1)* %in
228234 %srl = lshr i32 %val, 16
232238 ret void
233239 }
234240
235 ; SI-LABEL: {{^}}extract_byte3_to_f32:
236 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
237 ; SI-NOT: [[VAL]]
238 ; SI: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
239 ; SI: buffer_store_dword [[CONV]]
241 ; GCN-LABEL: {{^}}extract_byte3_to_f32:
242 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
243 ; GCN-NOT: [[VAL]]
244 ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
245 ; GCN: buffer_store_dword [[CONV]]
240246 define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
241247 %val = load i32, i32 addrspace(1)* %in
242248 %srl = lshr i32 %val, 24
0 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
2 ; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
3 ; FIXME: cypress is broken because the bigger testcases spill and spilling is not implemented
4
5 ; FUNC-LABEL: {{^}}zextload_global_i16_to_i32:
6 ; SI: buffer_load_ushort
7 ; SI: buffer_store_dword
8 ; SI: s_endpgm
9 define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
10 %a = load i16, i16 addrspace(1)* %in
11 %ext = zext i16 %a to i32
12 store i32 %ext, i32 addrspace(1)* %out
13 ret void
14 }
15
16 ; FUNC-LABEL: {{^}}sextload_global_i16_to_i32:
17 ; SI: buffer_load_sshort
18 ; SI: buffer_store_dword
19 ; SI: s_endpgm
20 define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
21 %a = load i16, i16 addrspace(1)* %in
22 %ext = sext i16 %a to i32
23 store i32 %ext, i32 addrspace(1)* %out
24 ret void
25 }
26
27 ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32:
28 ; SI: buffer_load_ushort
29 ; SI: s_endpgm
30 define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
31 %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
32 %ext = zext <1 x i16> %load to <1 x i32>
33 store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
34 ret void
35 }
36
37 ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32:
38 ; SI: buffer_load_sshort
39 ; SI: s_endpgm
40 define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
41 %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
42 %ext = sext <1 x i16> %load to <1 x i32>
43 store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
44 ret void
45 }
46
47 ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
48 ; SI: s_endpgm
49 define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
50 %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
51 %ext = zext <2 x i16> %load to <2 x i32>
52 store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
53 ret void
54 }
55
56 ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
57 ; SI: s_endpgm
58 define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
59 %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
60 %ext = sext <2 x i16> %load to <2 x i32>
61 store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
62 ret void
63 }
64
65 ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
66 ; SI: s_endpgm
67 define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
68 %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
69 %ext = zext <4 x i16> %load to <4 x i32>
70 store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
71 ret void
72 }
73
74 ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
75 ; SI: s_endpgm
76 define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
77 %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
78 %ext = sext <4 x i16> %load to <4 x i32>
79 store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
80 ret void
81 }
82
83 ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
84 ; SI: s_endpgm
85 define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
86 %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
87 %ext = zext <8 x i16> %load to <8 x i32>
88 store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
89 ret void
90 }
91
92 ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
93 ; SI: s_endpgm
94 define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
95 %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
96 %ext = sext <8 x i16> %load to <8 x i32>
97 store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
98 ret void
99 }
100
101 ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
102 ; SI: s_endpgm
103 define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
104 %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
105 %ext = zext <16 x i16> %load to <16 x i32>
106 store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
107 ret void
108 }
109
110 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
111 ; SI: s_endpgm
112 define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
113 %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
114 %ext = sext <16 x i16> %load to <16 x i32>
115 store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
116 ret void
117 }
118
119 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
120 ; SI: s_endpgm
121 define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
122 %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
123 %ext = zext <32 x i16> %load to <32 x i32>
124 store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
125 ret void
126 }
127
128 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
129 ; SI: s_endpgm
130 define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
131 %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
132 %ext = sext <32 x i16> %load to <32 x i32>
133 store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
134 ret void
135 }
136
137 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
138 ; SI: s_endpgm
139 define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
140 %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
141 %ext = zext <64 x i16> %load to <64 x i32>
142 store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
143 ret void
144 }
145
146 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
147 ; SI: s_endpgm
148 define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
149 %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
150 %ext = sext <64 x i16> %load to <64 x i32>
151 store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
152 ret void
153 }
154
155 ; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
156 ; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]],
157 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
158 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
159 define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
160 %a = load i16, i16 addrspace(1)* %in
161 %ext = zext i16 %a to i64
162 store i64 %ext, i64 addrspace(1)* %out
163 ret void
164 }
165
166 ; FUNC-LABEL: {{^}}sextload_global_i16_to_i64:
167 ; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0
168 ; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
169 ; VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0
170 define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
171 %a = load i16, i16 addrspace(1)* %in
172 %ext = sext i16 %a to i64
173 store i64 %ext, i64 addrspace(1)* %out
174 ret void
175 }
176
177 ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
178 ; SI: s_endpgm
179 define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
180 %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
181 %ext = zext <1 x i16> %load to <1 x i64>
182 store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
183 ret void
184 }
185
186 ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
187 ; SI: s_endpgm
188 define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
189 %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
190 %ext = sext <1 x i16> %load to <1 x i64>
191 store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
192 ret void
193 }
194
195 ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
196 ; SI: s_endpgm
197 define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
198 %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
199 %ext = zext <2 x i16> %load to <2 x i64>
200 store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
201 ret void
202 }
203
204 ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
205 ; SI: s_endpgm
206 define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
207 %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
208 %ext = sext <2 x i16> %load to <2 x i64>
209 store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
210 ret void
211 }
212
213 ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
214 ; SI: s_endpgm
215 define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
216 %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
217 %ext = zext <4 x i16> %load to <4 x i64>
218 store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
219 ret void
220 }
221
222 ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
223 ; SI: s_endpgm
224 define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
225 %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
226 %ext = sext <4 x i16> %load to <4 x i64>
227 store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
228 ret void
229 }
230
231 ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
232 ; SI: s_endpgm
233 define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
234 %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
235 %ext = zext <8 x i16> %load to <8 x i64>
236 store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
237 ret void
238 }
239
240 ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
241 ; SI: s_endpgm
242 define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
243 %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
244 %ext = sext <8 x i16> %load to <8 x i64>
245 store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
246 ret void
247 }
248
249 ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
250 ; SI: s_endpgm
251 define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
252 %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
253 %ext = zext <16 x i16> %load to <16 x i64>
254 store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
255 ret void
256 }
257
258 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
259 ; SI: s_endpgm
260 define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
261 %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
262 %ext = sext <16 x i16> %load to <16 x i64>
263 store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
264 ret void
265 }
266
267 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
268 ; SI: s_endpgm
269 define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
270 %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
271 %ext = zext <32 x i16> %load to <32 x i64>
272 store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
273 ret void
274 }
275
276 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
277 ; SI: s_endpgm
278 define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
279 %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
280 %ext = sext <32 x i16> %load to <32 x i64>
281 store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
282 ret void
283 }
284
285 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
286 ; SI: s_endpgm
287 define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
288 %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
289 %ext = zext <64 x i16> %load to <64 x i64>
290 store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
291 ret void
292 }
293
294 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
295 ; SI: s_endpgm
296 define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
297 %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
298 %ext = sext <64 x i16> %load to <64 x i64>
299 store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
300 ret void
301 }
378378
379379 ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
380380
381 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
382 ; GCN-DAG: v_cvt_f32_f16_e32
383 ; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
384 ; GCN-DAG: v_cvt_f32_f16_e32
385 ; GCN-DAG: v_cvt_f32_f16_e32
386
387 ; GCN: v_cvt_f64_f32_e32
388 ; GCN: v_cvt_f64_f32_e32
389 ; GCN: v_cvt_f64_f32_e32
381 ; XSI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
382 ; XSI: v_cvt_f32_f16_e32
383 ; XSI: v_cvt_f32_f16_e32
384 ; XSI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
385 ; XSI: v_cvt_f32_f16_e32
386 ; XSI-NOT: v_cvt_f32_f16
387
388 ; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
389 ; XVI: v_cvt_f32_f16_e32
390 ; XVI: v_cvt_f32_f16_e32
391 ; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
392 ; XVI: v_cvt_f32_f16_e32
393 ; XVI-NOT: v_cvt_f32_f16
394
395 ; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
396 ; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
397 ; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
398 ; GCN: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
399 ; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
400
401 ; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
402 ; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
403 ; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
390404 ; GCN-NOT: v_cvt_f64_f32_e32
391405
392 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
393 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
406 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
407 ; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
394408 ; GCN: s_endpgm
395409 define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
396410 %val = load <3 x half>, <3 x half> addrspace(1)* %in
None ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
23 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
34
45 declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone
7273 }
7374
7475 ; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8:
75 ; SI: buffer_load_dword
76 ; GCN: buffer_load_dword
7677 ; SI: v_add_i32
7778 ; SI-NEXT: v_and_b32_e32
78 ; SI-NOT: {{[^@]}}bfe
79 ; SI: s_endpgm
79 ; FIXME: Should be using s_add_i32
80 ; VI: v_add_i32
81 ; VI-NEXT: v_and_b32_e32
82 ; GCN-NOT: {{[^@]}}bfe
83 ; GCN: s_endpgm
8084 define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
8185 %load = load i32, i32 addrspace(1)* %in, align 4
8286 %add = add i32 %load, 1
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
33 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
44
55 ; FUNC-LABEL: {{^}}constant_load_i16:
427427 }
428428
429429 ; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:
430 ; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]],
430 ; FIXME: Need to optimize this sequence to avoid an extra bfe:
431 ; t28: i32,ch = load t12, t27, undef:i64
432 ; t31: i64 = any_extend t28
433 ; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16
434
435 ; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]],
431436 ; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
437 ; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]],
438 ; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
432439 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
433440
434441 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
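The FIXME above (and the identical ones in the global and local variants further down) records the DAG sequence that survives to selection: a sign_extend_inreg of an any_extend of the i16 load, which VI then lowers as buffer_load_ushort plus v_bfe_i32 instead of a single sextload. A minimal sketch of the shape of the missing combine, written against LLVM's SelectionDAG API; the function name and where it would be hooked in are hypothetical:

    // (sign_extend_inreg (any_extend x), vt) -> (sign_extend x) when x
    // already has type vt, so the in-register extend adds nothing. For
    // the sequence above this exposes the load as a sextload candidate.
    static SDValue foldSextInregOfAnyExt(SDNode *N, SelectionDAG &DAG) {
      assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
      SDValue Ext = N->getOperand(0);
      EVT InregVT = cast<VTSDNode>(N->getOperand(1))->getVT();
      if (Ext.getOpcode() != ISD::ANY_EXTEND)
        return SDValue();
      SDValue Src = Ext.getOperand(0);
      if (Src.getValueType() != InregVT) // only the exact-width case is safe
        return SDValue();
      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Src);
    }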
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
33 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
44 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
55
443443 }
444444
445445 ; FUNC-LABEL: {{^}}global_sextload_i16_to_i64:
446 ; GCN-NOHSA-DAG: buffer_load_sshort v[[LO:[0-9]+]],
446 ; FIXME: Need to optimize this sequence to avoid an extra bfe:
447 ; t28: i32,ch = load t12, t27, undef:i64
448 ; t31: i64 = any_extend t28
449 ; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16
450
451 ; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]],
447452 ; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
453 ; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]],
454 ; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
448455 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
449456
450457 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
33 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
44 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
55
162162 ; GCN-NOHSA: buffer_load_dword v
163163 ; GCN-HSA: flat_load_dword v
164164
165 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
165 ; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
166 ; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
166167 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
167168 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
168169
184185 ; GCN-NOHSA: buffer_load_dword v
185186 ; GCN-HSA: flat_load_dword v
186187
187 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
188 ; FIXME: Need to optimize this sequence to avoid an extra shift on VI.
189
190 ; t23: i16 = truncate t18
191 ; t49: i16 = srl t23, Constant:i32<8>
192 ; t57: i32 = any_extend t49
193 ; t58: i32 = sign_extend_inreg t57, ValueType:ch:i8
194
195 ; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
196 ; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}}
197 ; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8
188198 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
189199 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
190200
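The FIXME above (the same sequence recurs in the local i8 and sext tests below) comes from the srl being performed in i16 once 16-bit operations are legal: the shift selects to v_lshrrev_b16, so the sign extend can no longer fold into a single v_bfe_i32. A sketch of a combine that would keep the shift in the wide type, assuming LLVM's SelectionDAG API (name and placement hypothetical):

    // (sext_inreg (any_extend (srl (trunc x), C)), iN)
    //   -> (sext_inreg (srl x, C), iN)  when C + N fits in the narrow type,
    // since the inreg extend only reads the low N bits and those agree
    // between the narrow and wide shifts.
    static SDValue foldSextInregOfTruncSrl(SDNode *N, SelectionDAG &DAG) {
      SDValue Ext = N->getOperand(0);
      EVT InregVT = cast<VTSDNode>(N->getOperand(1))->getVT();
      if (Ext.getOpcode() != ISD::ANY_EXTEND)
        return SDValue();
      SDValue Srl = Ext.getOperand(0);
      if (Srl.getOpcode() != ISD::SRL ||
          Srl.getOperand(0).getOpcode() != ISD::TRUNCATE)
        return SDValue();
      SDValue Wide = Srl.getOperand(0).getOperand(0);
      auto *C = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      if (!C || Wide.getValueType() != N->getValueType(0))
        return SDValue();
      if (InregVT.getSizeInBits() + C->getZExtValue() >
          Srl.getValueType().getSizeInBits())
        return SDValue();
      SDLoc DL(N);
      SDValue NewSrl =
          DAG.getNode(ISD::SRL, DL, Wide.getValueType(), Wide,
                      DAG.getConstant(C->getZExtValue(), DL, MVT::i32));
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, N->getValueType(0),
                         NewSrl, N->getOperand(1));
    }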
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
22 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}local_load_i16:
538538 }
539539
540540 ; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
541 ; GCN: ds_read_i16 v[[LO:[0-9]+]],
541 ; FIXME: Need to optimize this sequence to avoid an extra shift.
542 ; t25: i32,ch = load t12, t10, undef:i32
543 ; t28: i64 = any_extend t25
544 ; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
545 ; SI: ds_read_i16 v[[LO:[0-9]+]],
546 ; VI: ds_read_u16 v[[ULO:[0-9]+]]
547 ; VI: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
542548 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
543549
544550 ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
22 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44
140140 ; GCN-NOT: s_wqm_b64
141141 ; GCN: s_mov_b32 m0
142142 ; GCN: ds_read_u16
143 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
144 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
143 ; FIXME: Need to optimize this sequence to avoid an extra shift on VI.
144 ; t23: i16 = srl t39, Constant:i32<8>
145 ; t31: i32 = any_extend t23
146 ; t33: i32 = sign_extend_inreg t31, ValueType:ch:i8
147
148 ; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
149 ; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
150
151 ; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}}
152 ; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
153 ; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8
145154
146155 ; EG: LDS_USHORT_READ_RET
147156 ; EG-DAG: BFE_INT
156165 ; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32:
157166 ; GCN: ds_read_b32
158167
159 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
168 ; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
169 ; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}}
160170 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
161171 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
162172
None ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
1 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
2 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
31 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
2 ; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
3 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC
4 ; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC
5
6 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
47
58 ; FUNC-LABEL: {{^}}u32_mad24:
69 ; EG: MULADD_UINT24
710 ; SI: v_mad_u32_u24
11 ; VI: v_mad_u32_u24
812
913 define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
1014 entry:
2428 ; The result must be sign-extended
2529 ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
2630 ; EG: 16
27 ; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
28 ; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
29
31 ; FIXME: Should be using scalar instructions here.
32 ; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
33 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
3034 define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
3135 entry:
3236 %0 = mul i16 %a, %b
3640 ret void
3741 }
3842
43 ; FIXME: Need to handle non-uniform case for function below (load without gep).
3944 ; FUNC-LABEL: {{^}}i8_mad24:
4045 ; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
4146 ; The result must be sign-extended
4247 ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
4348 ; EG: 8
44 ; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
45 ; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
46
49 ; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
50 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
4751 define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
4852 entry:
4953 %0 = mul i8 %a, %b
0 ; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
1
2
3 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
4
5 ; FIXME: Need to handle non-uniform case for function below (load without gep).
6 ; GCN-LABEL: {{^}}v_test_imax_sge_i16:
7 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
8 define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
9 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
10 %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
11 %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
12 %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
13 %a = load i16, i16 addrspace(1)* %gep0, align 4
14 %b = load i16, i16 addrspace(1)* %gep1, align 4
15 %cmp = icmp sge i16 %a, %b
16 %val = select i1 %cmp, i16 %a, i16 %b
17 store i16 %val, i16 addrspace(1)* %outgep, align 4
18 ret void
19 }
20
21 ; FIXME: Need to handle non-uniform case for function below (load without gep).
22 ; GCN-LABEL: {{^}}v_test_imax_sge_v4i16:
23 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
24 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
25 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
26 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
27 define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
28 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
29 %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid
30 %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid
31 %outgep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
32 %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep0, align 4
33 %b = load <4 x i16>, <4 x i16> addrspace(1)* %gep1, align 4
34 %cmp = icmp sge <4 x i16> %a, %b
35 %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
36 store <4 x i16> %val, <4 x i16> addrspace(1)* %outgep, align 4
37 ret void
38 }
39
40 ; FIXME: Need to handle non-uniform case for function below (load without gep).
41 ; GCN-LABEL: {{^}}v_test_imax_sgt_i16:
42 ; VI: v_max_i16_e32
43 define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
44 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
45 %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
46 %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
47 %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
48 %a = load i16, i16 addrspace(1)* %gep0, align 4
49 %b = load i16, i16 addrspace(1)* %gep1, align 4
50 %cmp = icmp sgt i16 %a, %b
51 %val = select i1 %cmp, i16 %a, i16 %b
52 store i16 %val, i16 addrspace(1)* %outgep, align 4
53 ret void
54 }
55
56 ; FIXME: Need to handle non-uniform case for function below (load without gep).
57 ; GCN-LABEL: {{^}}v_test_umax_uge_i16:
58 ; VI: v_max_u16_e32
59 define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
60 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
61 %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
62 %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
63 %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
64 %a = load i16, i16 addrspace(1)* %gep0, align 4
65 %b = load i16, i16 addrspace(1)* %gep1, align 4
66 %cmp = icmp uge i16 %a, %b
67 %val = select i1 %cmp, i16 %a, i16 %b
68 store i16 %val, i16 addrspace(1)* %outgep, align 4
69 ret void
70 }
71
72 ; FIXME: Need to handle non-uniform case for function below (load without gep).
73 ; GCN-LABEL: {{^}}v_test_umax_ugt_i16:
74 ; VI: v_max_u16_e32
75 define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
76 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
77 %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
78 %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
79 %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
80 %a = load i16, i16 addrspace(1)* %gep0, align 4
81 %b = load i16, i16 addrspace(1)* %gep1, align 4
82 %cmp = icmp ugt i16 %a, %b
83 %val = select i1 %cmp, i16 %a, i16 %b
84 store i16 %val, i16 addrspace(1)* %outgep, align 4
85 ret void
86 }
3030 }
3131
3232 ; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext:
33 ; GCN: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
33 ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
34 ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
3435 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
3536 define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
3637 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
6162 }
6263
6364 ; FUNC-LABEL: {{^}}test_umul24_i16_vgpr:
64 ; GCN: v_mul_u32_u24_e32
65 ; GCN: v_and_b32_e32
65 ; SI: v_mul_u32_u24_e32
66 ; SI: v_and_b32_e32
67 ; VI: v_mul_lo_u16
6668 define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
6769 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
6870 %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
7678 ret void
7779 }
7880
79 ; FIXME: Need to handle non-uniform case for function below (load without gep).
8081 ; FUNC-LABEL: {{^}}test_umul24_i8_vgpr:
81 ; GCN: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
82 ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
83 ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
8284 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
8385 define void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
8486 entry:
4949 %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
5050 %result = shl <4 x i32> %a, %b
5151 store <4 x i32> %result, <4 x i32> addrspace(1)* %out
52 ret void
53 }
54
55 ;VI: {{^}}shl_i16:
56 ;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
57
58 define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
59 %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
60 %a = load i16, i16 addrspace(1) * %in
61 %b = load i16, i16 addrspace(1) * %b_ptr
62 %result = shl i16 %a, %b
63 store i16 %result, i16 addrspace(1)* %out
64 ret void
65 }
66
67
68 ;VI: {{^}}shl_v2i16:
69 ;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
70 ;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
71
72 define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
73 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
74 %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
75 %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
76 %result = shl <2 x i16> %a, %b
77 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
78 ret void
79 }
80
81
82 ;VI: {{^}}shl_v4i16:
83 ;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
84 ;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
85 ;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
86 ;VI: v_lshlrev_b16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
87
88 define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
89 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
90 %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
91 %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
92 %result = shl <4 x i16> %a, %b
93 store <4 x i16> %result, <4 x i16> addrspace(1)* %out
5294 ret void
5395 }
5496
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
22
33 ; GCN-LABEL: {{^}}s_sext_i1_to_i32:
44 ; GCN: v_cndmask_b32_e64
5454 }
5555
5656 ; GCN-LABEL: {{^}}s_sext_i16_to_i64:
57 ; GCN: s_endpgm
57 ; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
5858 define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
5959 %sext = sext i16 %a to i64
6060 store i64 %sext, i64 addrspace(1)* %out, align 8
6161 ret void
6262 }
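The 0x100000 literal in the s_bfe_i64 check line above is the packed bitfield descriptor: the scalar BFE instructions take the field offset in bits [5:0] of src1 and the field width in bits [22:16]. A small illustration (the helper name is made up):

    // S_BFE_* src1 encoding: offset in [5:0], width in [22:16].
    constexpr unsigned sBfeSrc1(unsigned Offset, unsigned Width) {
      return (Width << 16) | (Offset & 0x3f);
    }
    static_assert(sBfeSrc1(/*Offset=*/0, /*Width=*/16) == 0x100000,
                  "a 16-bit field at offset 0, i.e. sext i16 -> i64");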
6363
64 ; GCN-LABEL: {{^}}s_sext_i1_to_i16:
65 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
66 ; GCN-NEXT: buffer_store_short [[RESULT]]
67 define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
68 %cmp = icmp eq i32 %a, %b
69 %sext = sext i1 %cmp to i16
70 store i16 %sext, i16 addrspace(1)* %out
71 ret void
72 }
73
6474 ; GCN-LABEL: {{^}}s_sext_v4i8_to_v4i32:
6575 ; GCN: s_load_dword [[VAL:s[0-9]+]]
66 ; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]]
67 ; GCN-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008
6876 ; GCN-DAG: s_bfe_i32 [[EXT2:s[0-9]+]], [[VAL]], 0x80010
6977 ; GCN-DAG: s_ashr_i32 [[EXT3:s[0-9]+]], [[VAL]], 24
78 ; SI-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008
79 ; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]]
80
81 ; FIXME: We end up with a v_bfe instruction, because the i16 srl
82 ; gets selected to a v_lshrrev_b16 instruction, so the input to
83 ; the bfe is a vector register. To fix this we need to be able to
84 ; optimize:
85 ; t29: i16 = truncate t10
86 ; t55: i16 = srl t29, Constant:i32<8>
87 ; t63: i32 = any_extend t55
88 ; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
89
90 ; VI-DAG: v_bfe_i32 [[VEXT1:v[0-9]+]], v{{[0-9]+}}, 0, 8
7091
7192 ; GCN-DAG: v_mov_b32_e32 [[VEXT0:v[0-9]+]], [[EXT0]]
72 ; GCN-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]]
93 ; SI-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]]
7394 ; GCN-DAG: v_mov_b32_e32 [[VEXT2:v[0-9]+]], [[EXT2]]
7495 ; GCN-DAG: v_mov_b32_e32 [[VEXT3:v[0-9]+]], [[EXT3]]
7596
95116
96117 ; GCN-LABEL: {{^}}v_sext_v4i8_to_v4i32:
97118 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
98 ; GCN-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
99 ; GCN-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8
100 ; GCN-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
119 ; FIXME: Need to optimize the same sequence as in the test above to
120 ; avoid this shift.
121 ; VI-DAG: v_lshrrev_b16_e32 [[SH16:v[0-9]+]], 8, [[VAL]]
101122 ; GCN-DAG: v_ashrrev_i32_e32 [[EXT3:v[0-9]+]], 24, [[VAL]]
123 ; VI-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
124 ; VI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
125 ; VI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[SH16]], 0, 8
126
127 ; SI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
128 ; SI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8
129 ; SI: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
102130
103131 ; GCN: buffer_store_dword [[EXT0]]
104132 ; GCN: buffer_store_dword [[EXT1]]
4242 %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
4343 %result = ashr <4 x i32> %a, %b
4444 store <4 x i32> %result, <4 x i32> addrspace(1)* %out
45 ret void
46 }
47
48 ; FUNC-LABEL: {{^}}ashr_v2i16:
49 ; FIXME: The ashr operation is uniform, but because its operands come from a
50 ; global load we end up with vector instructions rather than scalar ones.
51 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
52 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
53 define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
54 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
55 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
56 %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
57 %result = ashr <2 x i16> %a, %b
58 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
59 ret void
60 }
61
62 ; FUNC-LABEL: {{^}}ashr_v4i16:
63 ; FIXME: The ashr operation is uniform, but because its operands come from a
64 ; global load we end up with vector instructions rather than scalar ones.
65 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
66 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
67 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
68 ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
69 define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
70 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
71 %a = load <4 x i16>, <4 x i16> addrspace(1)* %in
72 %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
73 %result = ashr <4 x i16> %a, %b
74 store <4 x i16> %result, <4 x i16> addrspace(1)* %out
4575 ret void
4676 }
4777
5151 %result = sub <4 x i32> %a, %b
5252 store <4 x i32> %result, <4 x i32> addrspace(1)* %out
5353 ret void
54 }
55
56 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
57 define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
58 %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
59 %a = load i16, i16 addrspace(1)* %in
60 %b = load i16, i16 addrspace(1)* %b_ptr
61 %result = sub i16 %a, %b
62 store i16 %result, i16 addrspace(1)* %out
63 ret void
64 }
65
66 ; FUNC-LABEL: {{^}}test_sub_v2i16:
67
68 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
69 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
70
71 define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
72 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
73 %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
74 %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
75 %result = sub <2 x i16> %a, %b
76 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
77 ret void
78 }
79
80 ; FUNC-LABEL: {{^}}test_sub_v4i16:
81
82 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
83 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
84 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
85 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
86
87 define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
88 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
89 %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
90 %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
91 %result = sub <4 x i16> %a, %b
92 store <4 x i16> %result, <4 x i16> addrspace(1)* %out
93 ret void
5494 }
5595
5696 ; FUNC-LABEL: {{^}}s_sub_i64:
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,VI %s
22
33 ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
44 ; CHECK: buffer_load_dword v
4646 }
4747
4848 ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16:
49 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
49 ; FIXME: We need to teach the dagcombiner to reduce the load width for:
50 ; t21: v2i32,ch = load t12, t10, undef:i64
51 ; t23: i64 = bitcast t21
52 ; t30: i16 = truncate t23
53 ; SI: buffer_load_dword v[[VAL:[0-9]+]]
54 ; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]]
5055 ; CHECK: buffer_store_short v[[VAL]]
5156 define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
5257 %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
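The FIXME above asks for a load-width reduction: the truncate only needs the low 16 bits of the bitcast load, so the whole access could be re-emitted as a narrow load. A rough sketch of such a dagcombine, assuming LLVM's SelectionDAG API and ignoring the offset math a big-endian target or a non-zero byte offset would need (name and placement hypothetical):

    // (truncate (bitcast (load x))) -> (load x) at the narrow type, for a
    // simple one-use little-endian load whose low bits sit at the base.
    static SDValue narrowTruncOfBitcastLoad(SDNode *Trunc, SelectionDAG &DAG) {
      SDValue BC = Trunc->getOperand(0);
      EVT VT = Trunc->getValueType(0);
      if (BC.getOpcode() != ISD::BITCAST || !BC.hasOneUse())
        return SDValue();
      auto *LD = dyn_cast<LoadSDNode>(BC.getOperand(0));
      if (!LD || LD->isVolatile() || !LD->hasOneUse() ||
          LD->getExtensionType() != ISD::NON_EXTLOAD ||
          !DAG.getDataLayout().isLittleEndian())
        return SDValue();
      SDValue NewLoad = DAG.getLoad(VT, SDLoc(Trunc), LD->getChain(),
                                    LD->getBasePtr(), LD->getPointerInfo(),
                                    LD->getAlignment());
      // Chain users of the wide load must be rewired to the narrow one.
      DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
      return NewLoad;
    }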
2020 ret void
2121 }
2222
23 ; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
23 ; SI-LABEL: {{^}}s_arg_global_truncstore_i16_to_i1:
2424 ; SI: s_load_dword [[LOAD:s[0-9]+]],
2525 ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
2626 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
2727 ; SI: buffer_store_byte [[VREG]],
28 define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
28 define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
2929 %trunc = trunc i16 %val to i1
3030 store i1 %trunc, i1 addrspace(1)* %out, align 1
3131 ret void
3232 }
33 ; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
34 define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
35 %add = add i16 %val0, %val1
36 %trunc = trunc i16 %add to i1
37 store i1 %trunc, i1 addrspace(1)* %out, align 1
38 ret void
39 }
11 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
22 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
33
4 ; R600: {{^}}test:
4 ; R600: {{^}}s_mad_zext_i32_to_i64:
55 ; R600: MEM_RAT_CACHELESS STORE_RAW
66 ; R600: MEM_RAT_CACHELESS STORE_RAW
77
8 ; SI: {{^}}test:
8 ; SI: {{^}}s_mad_zext_i32_to_i64:
99 ; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
1010 ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
11 define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
11 define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
1212 entry:
13 %0 = mul i32 %a, %b
14 %1 = add i32 %0, %c
15 %2 = zext i32 %1 to i64
16 store i64 %2, i64 addrspace(1)* %out
13 %tmp0 = mul i32 %a, %b
14 %tmp1 = add i32 %tmp0, %c
15 %tmp2 = zext i32 %tmp1 to i64
16 store i64 %tmp2, i64 addrspace(1)* %out
1717 ret void
1818 }
1919
20 ; SI-LABEL: {{^}}testi1toi32:
20 ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32:
2121 ; SI: v_cndmask_b32
22 define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
22 define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
2323 entry:
24 %0 = icmp eq i32 %a, %b
25 %1 = zext i1 %0 to i32
26 store i32 %1, i32 addrspace(1)* %out
24 %tmp0 = icmp eq i32 %a, %b
25 %tmp1 = zext i1 %tmp0 to i32
26 store i32 %tmp1, i32 addrspace(1)* %out
2727 ret void
2828 }
2929
30 ; SI-LABEL: {{^}}zext_i1_to_i64:
30 ; SI-LABEL: {{^}}s_arg_zext_i1_to_i64:
31 define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
32 %ext = zext i1 %arg to i64
33 store i64 %ext, i64 addrspace(1)* %out, align 8
34 ret void
35 }
36
37 ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i64:
3138 ; SI: s_mov_b32 s{{[0-9]+}}, 0
3239 ; SI: v_cmp_eq_u32
3340 ; SI: v_cndmask_b32
34 define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
41 define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
3542 %cmp = icmp eq i32 %a, %b
3643 %ext = zext i1 %cmp to i64
3744 store i64 %ext, i64 addrspace(1)* %out, align 8
3845 ret void
3946 }
47
48 ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16:
49 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
50 ; SI: buffer_store_short [[RESULT]]
51 define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
52 %tmp0 = icmp eq i16 %a, %b
53 %tmp1 = zext i1 %tmp0 to i16
54 store i16 %tmp1, i16 addrspace(1)* %out
55 ret void
56 }
57
58 attributes #0 = { nounwind }