llvm.org GIT mirror llvm / 27f4f2f
AMDGPU: Support v2i16/v2f16 packed operations git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296396 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 2 years ago
37 changed file(s) with 3640 addition(s) and 415 deletion(s). Raw diff Collapse all Expand all
180180 }
181181
182182 bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
183 if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
184 T->getIntegerBitWidth() <= 16)
183   const IntegerType *IntTy = dyn_cast<IntegerType>(T);
184   if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
185185 return true;
186 if (!T->isVectorTy())
187 return false;
188   return needsPromotionToI32(cast<VectorType>(T)->getElementType());
186
187   if (const VectorType *VT = dyn_cast<VectorType>(T)) {
188 // TODO: The set of packed operations is more limited, so may want to
189 // promote some anyway.
190 if (ST->hasVOP3PInsts())
191 return false;
192
193 return needsPromotionToI32(VT->getElementType());
194 }
195
196 return false;
189197 }
190198
191199 // Return true if the op promoted to i32 should have nsw set.
158158 SDValue &Clamp,
159159 SDValue &Omod) const;
160160
161 bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
162 bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
163 SDValue &Clamp) const;
164
161165 void SelectADD_SUB_I64(SDNode *N);
162166 void SelectUADDO_USUBO(SDNode *N);
163167 void SelectDIV_SCALE(SDNode *N);
302306 }
303307
304308 llvm_unreachable("invalid vector size");
309 }
310
311 static bool getConstantValue(SDValue N, uint32_t &Out) {
312   if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
313 Out = C->getAPIntValue().getZExtValue();
314 return true;
315 }
316
317   if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
318 Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
319 return true;
320 }
321
322 return false;
305323 }
306324
307325 void AMDGPUDAGToDAGISel::Select(SDNode *N) {
355373 EVT VT = N->getValueType(0);
356374 unsigned NumVectorElts = VT.getVectorNumElements();
357375 EVT EltVT = VT.getVectorElementType();
376
377 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
378 if (Opc == ISD::BUILD_VECTOR) {
379 uint32_t LHSVal, RHSVal;
380 if (getConstantValue(N->getOperand(0), LHSVal) &&
381 getConstantValue(N->getOperand(1), RHSVal)) {
382 uint32_t K = LHSVal | (RHSVal << 16);
383 CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
384 CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
385 return;
386 }
387 }
388
389 break;
390 }
391
358392 assert(EltVT.bitsEq(MVT::i32));
393
359394 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
360395 RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
361396 } else {
15641599 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
15651600 SDValue &SrcMods) const {
15661601 unsigned Mods = 0;
1567
15681602 Src = In;
15691603
15701604 if (Src.getOpcode() == ISD::FNEG) {
15781612 }
15791613
15801614 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
1581
15821615 return true;
15831616 }
15841617
16301663 SDValue &Omod) const {
16311664 Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
16321665 return SelectVOP3Mods(In, Src, SrcMods);
1666 }
1667
1668 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
1669 SDValue &SrcMods) const {
1670 unsigned Mods = 0;
1671 Src = In;
1672
1673 // FIXME: Look for on separate components
1674 if (Src.getOpcode() == ISD::FNEG) {
1675 Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI);
1676 Src = Src.getOperand(0);
1677 }
1678
1679 // Packed instructions do not have abs modifiers.
1680
1681 // FIXME: Handle abs/neg of individual components.
1682 // FIXME: Handle swizzling with op_sel
1683 Mods |= SISrcMods::OP_SEL_1;
1684
1685 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
1686 return true;
1687 }
1688
1689 bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
1690 SDValue &SrcMods,
1691 SDValue &Clamp) const {
1692 SDLoc SL(In);
1693
1694 // FIXME: Handle clamp and op_sel
1695 Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
1696
1697 return SelectVOP3PMods(In, Src, SrcMods);
16331698 }
16341699
16351700 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
643643
644644 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
645645 assert(VT.isFloatingPoint());
646 return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
647 VT == MVT::f16);
646
647 // Packed operations do not have a fabs modifier.
648 return VT == MVT::f32 || VT == MVT::f64 ||
649 (Subtarget->has16BitInsts() && VT == MVT::f16);
648650 }
649651
650652 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
651 return isFAbsFree(VT);
653 assert(VT.isFloatingPoint());
654 return VT == MVT::f32 || VT == MVT::f64 ||
655 (Subtarget->has16BitInsts() && VT == MVT::f16) ||
656 (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
652657 }
653658
654659 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
7474 // Misc. PatFrags
7575 //===----------------------------------------------------------------------===//
7676
77 class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
78 (ops node:$src0),
79 (op $src0),
80 [{ return N->hasOneUse(); }]
81 >;
82
7783 class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
7884 (ops node:$src0, node:$src1),
7985 (op $src0, $src1),
8692 [{ return N->hasOneUse(); }]
8793 >;
8894
95 def trunc_oneuse : HasOneUseUnaryOp<trunc>;
8996
9097 let Properties = [SDNPCommutative, SDNPAssociative] in {
9198 def smax_oneuse : HasOneUseBinOp<smax>;
100107 } // Properties = [SDNPCommutative, SDNPAssociative]
101108
102109 def sub_oneuse : HasOneUseBinOp<sub>;
110
111 def srl_oneuse : HasOneUseBinOp<srl>;
103112 def shl_oneuse : HasOneUseBinOp<shl>;
104113
105114 def select_oneuse : HasOneUseTernaryOp<select>;
439448 int TWO_PI_INV = 0x3e22f983;
440449 int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
441450 int FP16_ONE = 0x3C00;
451 int V2FP16_ONE = 0x3C003C00;
442452 int FP32_ONE = 0x3f800000;
443453 int FP32_NEG_ONE = 0xbf800000;
444454 int FP64_ONE = 0x3ff0000000000000;
366366 const TargetRegisterClass *FoldRC =
367367 TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
368368
369 APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
370 OpToFold.getImm());
371369
372370 // Split 64-bit constants into 32-bits for folding.
373371 if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
377375 MRI->getRegClass(UseReg) :
378376 TRI->getPhysRegClass(UseReg);
379377
380 assert(Imm.getBitWidth() == 64);
381
382378 if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
383379 return;
384380
381 APInt Imm(64, OpToFold.getImm());
385382 if (UseOp.getSubReg() == AMDGPU::sub0) {
386383 Imm = Imm.getLoBits(32);
387384 } else {
388385 assert(UseOp.getSubReg() == AMDGPU::sub1);
389386 Imm = Imm.getHiBits(32);
390387 }
391 }
392
393 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
394 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
388
389 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
390 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
391 return;
392 }
393
394
395
396 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
395397 }
396398
397399 static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
401401 }
402402 }
403403
404 // XXX - Do these do anything? Vector constants turn into build_vector.
405 setOperationAction(ISD::Constant, MVT::v2i16, Legal);
406 setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
407
404408 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
405409 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
406410 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
410414 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
411415 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
412416 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
417
418 setOperationAction(ISD::AND, MVT::v2i16, Promote);
419 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
420 setOperationAction(ISD::OR, MVT::v2i16, Promote);
421 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
422 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
423 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
424 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
425 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
426 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
427 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
428
429 setOperationAction(ISD::ADD, MVT::v2i16, Legal);
430 setOperationAction(ISD::SUB, MVT::v2i16, Legal);
431 setOperationAction(ISD::MUL, MVT::v2i16, Legal);
432 setOperationAction(ISD::SHL, MVT::v2i16, Legal);
433 setOperationAction(ISD::SRL, MVT::v2i16, Legal);
434 setOperationAction(ISD::SRA, MVT::v2i16, Legal);
435 setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
436 setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
437 setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
438 setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
439
440 setOperationAction(ISD::FADD, MVT::v2f16, Legal);
441 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
442 setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
443 setOperationAction(ISD::FMA, MVT::v2f16, Legal);
444 setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
445 setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
446
447 // This isn't really legal, but this avoids the legalizer unrolling it (and
448 // allows matching fneg (fabs x) patterns)
449 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
450
451 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
452 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
453
454 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
455 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
456 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
413457 }
414458
415459 setTargetDAGCombine(ISD::FADD);
427471 setTargetDAGCombine(ISD::SINT_TO_FP);
428472 setTargetDAGCombine(ISD::UINT_TO_FP);
429473 setTargetDAGCombine(ISD::FCANONICALIZE);
474 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
430475
431476 // All memory operations. Some folding on the pointer operand is done to help
432477 // matching the constant offsets in the addressing modes.
39644009 SDValue SITargetLowering::performFCanonicalizeCombine(
39654010 SDNode *N,
39664011 DAGCombinerInfo &DCI) const {
3967   ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4012 ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
39684013 if (!CFP)
39694014 return SDValue();
39704015
39744019 // Flush denormals to 0 if not enabled.
39754020 if (C.isDenormal()) {
39764021 EVT VT = N->getValueType(0);
3977 if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
4022 EVT SVT = VT.getScalarType();
4023 if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
39784024 return DAG.getConstantFP(0.0, SDLoc(N), VT);
39794025
3980 if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
4026 if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
39814027 return DAG.getConstantFP(0.0, SDLoc(N), VT);
39824028
3983 if (VT == MVT::f16 && !Subtarget->hasFP16Denormals())
4029 if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
39844030 return DAG.getConstantFP(0.0, SDLoc(N), VT);
39854031 }
39864032
40004046 return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
40014047 }
40024048
4003 return SDValue(CFP, 0);
4049 return N->getOperand(0);
40044050 }
40054051
40064052 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
42694315
42704316 SelectionDAG &DAG = DCI.DAG;
42714317 EVT VT = N->getValueType(0);
4272 assert(!VT.isVector());
42734318
42744319 SDLoc SL(N);
42754320 SDValue LHS = N->getOperand(0);
45084553 return performFMed3Combine(N, DCI);
45094554 case AMDGPUISD::CVT_PKRTZ_F16_F32:
45104555 return performCvtPkRTZCombine(N, DCI);
4556 case ISD::SCALAR_TO_VECTOR: {
4557 SelectionDAG &DAG = DCI.DAG;
4558 EVT VT = N->getValueType(0);
4559
4560 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
4561 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
4562 SDLoc SL(N);
4563 SDValue Src = N->getOperand(0);
4564 EVT EltVT = Src.getValueType();
4565 if (EltVT == MVT::f16)
4566 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
4567
4568 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
4569 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
4570 }
4571
4572 break;
4573 }
45114574 }
45124575 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
45134576 }
18381838 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
18391839
18401840 int64_t Imm = MO.getImm();
1841 switch (operandBitWidth(OperandType)) {
1842 case 32: {
1841 switch (OperandType) {
1842 case AMDGPU::OPERAND_REG_IMM_INT32:
1843 case AMDGPU::OPERAND_REG_IMM_FP32:
1844 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
1845 case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
18431846     int32_t Trunc = static_cast<int32_t>(Imm);
18441847 return Trunc == Imm &&
18451848 AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
18461849 }
1847 case 64: {
1850 case AMDGPU::OPERAND_REG_IMM_INT64:
1851 case AMDGPU::OPERAND_REG_IMM_FP64:
1852 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
1853 case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
18481854 return AMDGPU::isInlinableLiteral64(MO.getImm(),
18491855 ST.hasInv2PiInlineImm());
18501856 }
1851 case 16: {
1857 case AMDGPU::OPERAND_REG_IMM_INT16:
1858 case AMDGPU::OPERAND_REG_IMM_FP16:
1859 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
1860 case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
18521861 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
18531862 // A few special case instructions have 16-bit operands on subtargets
18541863 // where 16-bit instructions are not legal.
18601869 }
18611870
18621871 return false;
1872 }
1873 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
1874 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
1875     uint32_t Trunc = static_cast<uint32_t>(Imm);
1876 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
18631877 }
18641878 default:
18651879 llvm_unreachable("invalid bitwidth");
31163130 case AMDGPU::S_BFE_U64:
31173131 case AMDGPU::S_BFM_B64:
31183132 llvm_unreachable("Moving this op to VALU not implemented");
3133
3134 case AMDGPU::S_PACK_LL_B32_B16:
3135 case AMDGPU::S_PACK_LH_B32_B16:
3136 case AMDGPU::S_PACK_HH_B32_B16: {
3137 movePackToVALU(Worklist, MRI, Inst);
3138 Inst.eraseFromParent();
3139 continue;
3140 }
31193141 }
31203142
31213143 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
34663488 }
34673489 }
34683490
3491 void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
3492 MachineRegisterInfo &MRI,
3493 MachineInstr &Inst) const {
3494 unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3495 MachineBasicBlock *MBB = Inst.getParent();
3496 MachineOperand &Src0 = Inst.getOperand(1);
3497 MachineOperand &Src1 = Inst.getOperand(2);
3498 const DebugLoc &DL = Inst.getDebugLoc();
3499
3500 switch (Inst.getOpcode()) {
3501 case AMDGPU::S_PACK_LL_B32_B16: {
3502 // v_pack_b32_f16 flushes denormals if not enabled. Use it if the default
3503 // is to leave them untouched.
3504 // XXX: Does this do anything to NaNs?
3505 if (ST.hasFP16Denormals()) {
3506 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_PACK_B32_F16), ResultReg)
3507 .addImm(0) // src0_modifiers
3508 .add(Src0) // src0
3509 .addImm(0) // src1_modifiers
3510 .add(Src1) // src2
3511 .addImm(0) // clamp
3512 .addImm(0); // omod
3513 } else {
3514 unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3515 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3516
3517 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
3518 // 0.
3519 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3520 .addImm(0xffff);
3521
3522 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
3523 .addReg(ImmReg, RegState::Kill)
3524 .add(Src0);
3525
3526 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
3527 .add(Src1)
3528 .addImm(16)
3529 .addReg(TmpReg, RegState::Kill);
3530 }
3531
3532 break;
3533 }
3534 case AMDGPU::S_PACK_LH_B32_B16: {
3535 unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3536 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3537 .addImm(0xffff);
3538 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
3539 .addReg(ImmReg, RegState::Kill)
3540 .add(Src0)
3541 .add(Src1);
3542 break;
3543 }
3544 case AMDGPU::S_PACK_HH_B32_B16: {
3545 unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3546 unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3547 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
3548 .addImm(16)
3549 .add(Src0);
3550 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3551 .addImm(0xffff);
3552 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
3553 .add(Src1)
3554 .addReg(ImmReg, RegState::Kill)
3555 .addReg(TmpReg, RegState::Kill);
3556 break;
3557 }
3558 default:
3559 llvm_unreachable("unhandled s_pack_* instruction");
3560 }
3561
3562 MachineOperand &Dest = Inst.getOperand(0);
3563 MRI.replaceRegWith(Dest.getReg(), ResultReg);
3564 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3565 }
3566
34693567 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
34703568   MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
34713569 // This assumes that all the users of SCC are in the same block
6868 MachineInstr &Inst) const;
6969   void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
7070                            MachineInstr &Inst) const;
71   void movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
72                       MachineRegisterInfo &MRI,
73                       MachineInstr &Inst) const;
7174
7275 void addUsersToMoveToVALUWorklist(
7376 unsigned Reg, MachineRegisterInfo &MRI,
497500 return !RI.isSGPRReg(MRI, Dest);
498501 }
499502
500 static int operandBitWidth(uint8_t OperandType) {
501 switch (OperandType) {
502 case AMDGPU::OPERAND_REG_IMM_INT32:
503 case AMDGPU::OPERAND_REG_IMM_FP32:
504 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
505 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
506 return 32;
507 case AMDGPU::OPERAND_REG_IMM_INT64:
508 case AMDGPU::OPERAND_REG_IMM_FP64:
509 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
510 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
511 return 64;
512 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
513 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
514 case AMDGPU::OPERAND_REG_IMM_INT16:
515 case AMDGPU::OPERAND_REG_IMM_FP16:
516 return 16;
517 default:
518 llvm_unreachable("unexpected operand type");
519 }
520 }
521
522503 bool isInlineConstant(const APInt &Imm) const;
523504
524505 bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
195195 return isCBranchSCC(N);
196196 }]>;
197197
198 def lshr_rev : PatFrag <
199 (ops node:$src1, node:$src0),
200 (srl $src0, $src1)
201 >;
202
203 def ashr_rev : PatFrag <
204 (ops node:$src1, node:$src0),
205 (sra $src0, $src1)
206 >;
207
208 def lshl_rev : PatFrag <
209 (ops node:$src1, node:$src0),
210 (shl $src0, $src1)
211 >;
212
198213 multiclass SIAtomicM0Glue2 <string op_name> {
199214
200215 def _glue : SDNode <
657672 int FLAT_SCR = 0x68;
658673 }
659674
675 // This should be kept in sync with SISrcMods enum
660676 def SRCMODS {
661677 int NONE = 0;
662678 int NEG = 1;
663679 int ABS = 2;
664680 int NEG_ABS = 3;
681
682 int NEG_HI = ABS;
683 int OP_SEL_0 = 4;
684 int OP_SEL_1 = 8;
665685 }
666686
667687 def DSTCLAMP {
621621 def : BitConvert ;
622622 def : BitConvert ;
623623 def : BitConvert ;
624 def : BitConvert <v2i16, i32, SReg_32>;
625 def : BitConvert <i32, v2i16, SReg_32>;
626 def : BitConvert <v2f16, i32, SReg_32>;
627 def : BitConvert <i32, v2f16, SReg_32>;
624628
625629 // 64-bit bitcast
626630 def : BitConvert ;
774778 (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
775779 >;
776780
781 def : Pat <
782 (fneg v2f16:$src),
783 (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
784 >;
785
786 def : Pat <
787 (fabs v2f16:$src),
788 (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
789 >;
790
791 // This is really (fneg (fabs v2f16:$src))
792 //
793 // fabs is not reported as free because there is modifier for it in
794 // VOP3P instructions, so it is turned into the bit op.
795 def : Pat <
796 (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
797 (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
798 >;
799
777800 /********** ================== **********/
778801 /********** Immediate Patterns **********/
779802 /********** ================== **********/
11061129 (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
11071130 >;
11081131
1132 def : Pat<
1133 (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
1134 (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
1135 >;
1136
1137
11091138 // Allow integer inputs
11101139 class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
11111140 (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
11141143
11151144 def : ExpPattern<AMDGPUexport, i32, EXP>;
11161145 def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
1146
1147 def : Pat <
1148 (v2i16 (build_vector i16:$src0, i16:$src1)),
1149 (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
1150 >;
1151
1152 // With multiple uses of the shift, this will duplicate the shift and
1153 // increase register pressure.
1154 def : Pat <
1155 (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
1156 (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
1157 >;
1158
1159 def : Pat <
1160 (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
1161 (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
1162 (v2i16 (S_PACK_HH_B32_B16 $src0, $src1))
1163 >;
1164
1165 // TODO: Should source modifiers be matched to v_pack_b32_f16?
1166 def : Pat <
1167 (v2f16 (build_vector f16:$src0, f16:$src1)),
1168 (v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
1169 >;
1170
1171 // def : Pat <
1172 // (v2f16 (scalar_to_vector f16:$src0)),
1173 // (COPY $src0)
1174 // >;
1175
1176 // def : Pat <
1177 // (v2i16 (scalar_to_vector i16:$src0)),
1178 // (COPY $src0)
1179 // >;
11171180
11181181 //===----------------------------------------------------------------------===//
11191182 // Fract Patterns
2323 >;
2424
2525 let isCommutable = 1 in {
26 def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
27 def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
28 def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
29 def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
30 def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
26 def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
27 def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
28 def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
29 def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
30 def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
3131
32 def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
32 def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
3333 def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
34 def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
35 def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
34 def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
35 def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
3636
37 def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
38 def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
39 def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
40 def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
37 def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
38 def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
39 def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
40 def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
4141 }
4242
43 def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
44 def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
45 def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
43 def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
44 def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
45 def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
4646
4747 // XXX - Commutable?
4848 def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile>;
0 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
2
3 ; FIXME: Need to handle non-uniform case for function below (load without gep).
4 ; GCN-LABEL: {{^}}v_test_add_v2i16:
5 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
6
7 ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
8 ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
9 define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
10 %tid = call i32 @llvm.amdgcn.workitem.id.x()
11 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
12 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
13 %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
14 %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
15 %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
16 %add = add <2 x i16> %a, %b
17 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
18 ret void
19 }
20
21 ; GCN-LABEL: {{^}}s_test_add_v2i16:
22 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
23 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
24 ; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
25 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
26
27 ; VI: s_add_i32
28 ; VI: s_add_i32
29 define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
30 %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
31 %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
32 %add = add <2 x i16> %a, %b
33 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
34 ret void
35 }
36
37 ; GCN-LABEL: {{^}}s_test_add_self_v2i16:
38 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
39 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL]], [[VAL]]
40
41 ; VI: s_add_i32
42 ; VI: s_add_i32
43 define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
44 %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
45 %add = add <2 x i16> %a, %a
46 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
47 ret void
48 }
49
50 ; FIXME: VI should not scalarize arg access.
51 ; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
52 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
53
54 ; VI: v_add_i32
55 ; VI: v_add_i32
56 define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
57 %add = add <2 x i16> %a, %b
58 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
59 ret void
60 }
61
62 ; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
63 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
64 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
65
66 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
67 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}}
68 define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
69 %tid = call i32 @llvm.amdgcn.workitem.id.x()
70 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
71 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
72 %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
73   %add = add <2 x i16> %a, <i16 123, i16 456>
74 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
75 ret void
76 }
77
78 ; FIXME: Need to handle non-uniform case for function below (load without gep).
79 ; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant:
80 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
81 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
82
83 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
84 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}}
85 define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
86 %tid = call i32 @llvm.amdgcn.workitem.id.x()
87 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
88 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
89 %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
90   %add = add <2 x i16> %a, <i16 -845, i16 -991>
91 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
92 ret void
93 }
94
95 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
96 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
97
98 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
99 ; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
100 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD0]]
101 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
102 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
103 ; VI: v_or_b32_e32
104 define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
105 %tid = call i32 @llvm.amdgcn.workitem.id.x()
106 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
107 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
108 %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
109   %add = add <2 x i16> %a, <i16 -1, i16 -1>
110 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
111 ret void
112 }
113
114 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
115 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
116 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
117
118 ; VI-NOT: v_add_u16
119 ; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
120 ; VI-NOT: v_add_u16
121 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
122 ; VI: v_or_b32_e32
123 define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
124 %tid = call i32 @llvm.amdgcn.workitem.id.x()
125 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
126 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
127 %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
128   %add = add <2 x i16> %a, <i16 32, i16 0>
129 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
130 ret void
131 }
132
133 ; The high element gives fp
134 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split:
135 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
136 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
137
138 ; VI-NOT: v_add_u16
139 ; VI: v_add_u16_e32 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}}
140 ; VI-NOT: v_add_u16
141 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
142 ; VI: v_or_b32_e32
143 define void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
144 %tid = call i32 @llvm.amdgcn.workitem.id.x()
145 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
146 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
147 %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
148   %add = add <2 x i16> %a, <i16 0, i16 16256>
149 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
150 ret void
151 }
152
153 ; FIXME: Need to handle non-uniform case for function below (load without gep).
154 ; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i32:
155 ; GFX9: flat_load_dword [[A:v[0-9]+]]
156 ; GFX9: flat_load_dword [[B:v[0-9]+]]
157
158 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
159 ; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
160 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
161 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
162
163 ; VI: flat_load_ushort v[[A_HI:[0-9]+]]
164 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
165 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
166 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
167
168 ; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
169 ; VI-NOT: and
170 ; VI-NOT: shl
171 ; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
172 ; VI-NOT: and
173 ; VI-NOT: shl
174 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
175 define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
176 %tid = call i32 @llvm.amdgcn.workitem.id.x()
177 %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
178 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
179 %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
180 %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
181 %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
182 %add = add <2 x i16> %a, %b
183 %ext = zext <2 x i16> %add to <2 x i32>
184 store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
185 ret void
186 }
187
188 ; FIXME: Need to handle non-uniform case for function below (load without gep).
189 ; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64:
190 ; GFX9: flat_load_dword [[A:v[0-9]+]]
191 ; GFX9: flat_load_dword [[B:v[0-9]+]]
192
193 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
194 ; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
195 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
196 ; GFX9-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
197 ; GFX9: buffer_store_dwordx4
198
199 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
200 ; VI: flat_load_ushort v[[A_HI:[0-9]+]]
201 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
202 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
203
204 ; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
205 ; VI: v_add_u16_e32
206 ; VI: v_add_u16_e32
207 ; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
208
209 ; VI: buffer_store_dwordx4
210 define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
211 %tid = call i32 @llvm.amdgcn.workitem.id.x()
212 %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
213 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
214 %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
215 %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
216 %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
217 %add = add <2 x i16> %a, %b
218 %ext = zext <2 x i16> %add to <2 x i64>
219 store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
220 ret void
221 }
222
223 ; FIXME: Need to handle non-uniform case for function below (load without gep).
224 ; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i32:
225 ; GFX9: flat_load_dword [[A:v[0-9]+]]
226 ; GFX9: flat_load_dword [[B:v[0-9]+]]
227
228 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
229 ; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
230 ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
231 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
232
233 ; VI: v_add_u16_e32
234 ; VI: v_add_u16_e32
235 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
236 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
237 ; VI: buffer_store_dwordx2
238 define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
239 %tid = call i32 @llvm.amdgcn.workitem.id.x()
240 %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
241 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
242 %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
243 %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
244 %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
245 %add = add <2 x i16> %a, %b
246 %ext = sext <2 x i16> %add to <2 x i32>
247 store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
248 ret void
249 }
250
251 ; FIXME: Need to handle non-uniform case for function below (load without gep).
252 ; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i64:
253 ; GCN: flat_load_dword
254 ; GCN: flat_load_dword
255
256 ; GFX9: v_pk_add_u16
257 ; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
258
259 ; VI: v_add_u16_e32
260 ; VI: v_add_u16_e32
261
262 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
263 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
264 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
265 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
266 define void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
267 %tid = call i32 @llvm.amdgcn.workitem.id.x()
268 %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
269 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
270 %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
271 %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
272 %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
273 %add = add <2 x i16> %a, %b
274 %ext = sext <2 x i16> %add to <2 x i64>
275 store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
276 ret void
277 }
278
279 declare i32 @llvm.amdgcn.workitem.id.x() #0
280
281 attributes #0 = { nounwind readnone }
282 attributes #1 = { nounwind }
2525
2626 define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
2727 %load = load float, float addrspace(1)* %in, align 4
28 %bc = bitcast float %load to <2 x i16>
29 store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4
28 %fadd32 = fadd float %load, 1.0
29 %bc = bitcast float %fadd32 to <2 x i16>
30 %add.bitcast = add <2 x i16> %bc, <i16 2, i16 2>
31 store <2 x i16> %add.bitcast, <2 x i16> addrspace(1)* %out
3032 ret void
3133 }
3234
3335 define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
3436 %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
35 %bc = bitcast <2 x i16> %load to float
36 store float %bc, float addrspace(1)* %out, align 4
37 %add.v2i16 = add <2 x i16> %load, <i16 2, i16 2>
38 %bc = bitcast <2 x i16> %add.v2i16 to float
39 %fadd.bitcast = fadd float %bc, 1.0
40 store float %fadd.bitcast, float addrspace(1)* %out
41 ret void
42 }
43
44 define void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
45 %load = load float, float addrspace(1)* %in, align 4
46 %fadd32 = fadd float %load, 1.0
47 %bc = bitcast float %fadd32 to <2 x half>
48 %add.bitcast = fadd <2 x half> %bc, <half 2.0, half 2.0>
49 store <2 x half> %add.bitcast, <2 x half> addrspace(1)* %out
50 ret void
51 }
52
53 define void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
54 %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4
55 %add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0>
56 %bc = bitcast <2 x half> %add.v2f16 to float
57 %fadd.bitcast = fadd float %bc, 1.0
58 store float %fadd.bitcast, float addrspace(1)* %out
3759 ret void
3860 }
3961
5779 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
5880 %add = add <2 x i32> %val, <i32 4, i32 9>
5981 %bc = bitcast <2 x i32> %add to double
60 store double %bc, double addrspace(1)* %out, align 8
82 %fadd.bc = fadd double %bc, 1.0
83 store double %fadd.bc, double addrspace(1)* %out, align 8
6184 ret void
6285 }
6386
0 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
3
4 ; GCN-LABEL: {{^}}s_ashr_v2i16:
5 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
6 ; GFX9: s_load_dword [[RHS:s[0-9]+]]
7 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
8 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
9
10 ; CIVI: v_ashrrev_i32_e32
11 ; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
12 ; CIVI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
13 ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
14 ; CIVI: v_or_b32_e32
15 define void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
16 %result = ashr <2 x i16> %lhs, %rhs
17 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
18 ret void
19 }
20
21 ; GCN-LABEL: {{^}}v_ashr_v2i16:
22 ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
23 ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
24 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
25
26 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
27 ; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
28 ; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
29 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
30 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
31
32 ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
33 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], [[RHS]]
34 ; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
35 ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
36 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
37 ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
38 ; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
39 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
40 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
41 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
42 define void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
43 %tid = call i32 @llvm.amdgcn.workitem.id.x()
44 %tid.ext = sext i32 %tid to i64
45 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
46 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
47 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
48 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
49 %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
50 %result = ashr <2 x i16> %a, %b
51 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
52 ret void
53 }
54
55 ; GCN-LABEL: {{^}}ashr_v_s_v2i16:
56 ; GFX9: s_load_dword [[RHS:s[0-9]+]]
57 ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
58 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
59 define void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
60 %tid = call i32 @llvm.amdgcn.workitem.id.x()
61 %tid.ext = sext i32 %tid to i64
62 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
63 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
64 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
65 %result = ashr <2 x i16> %vgpr, %sgpr
66 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
67 ret void
68 }
69
70 ; GCN-LABEL: {{^}}ashr_s_v_v2i16:
71 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
72 ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
73 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
74 define void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
75 %tid = call i32 @llvm.amdgcn.workitem.id.x()
76 %tid.ext = sext i32 %tid to i64
77 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
78 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
79 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
80 %result = ashr <2 x i16> %sgpr, %vgpr
81 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
82 ret void
83 }
84
85 ; GCN-LABEL: {{^}}ashr_imm_v_v2i16:
86 ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
87 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], -4
88 define void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
89 %tid = call i32 @llvm.amdgcn.workitem.id.x()
90 %tid.ext = sext i32 %tid to i64
91 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
92 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
93 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
94 %result = ashr <2 x i16> <i16 -4, i16 -4>, %vgpr
95 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
96 ret void
97 }
98
99 ; GCN-LABEL: {{^}}ashr_v_imm_v2i16:
100 ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
101 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], 8, [[LHS]]
102 define void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
103 %tid = call i32 @llvm.amdgcn.workitem.id.x()
104 %tid.ext = sext i32 %tid to i64
105 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
106 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
107 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
108 %result = ashr <2 x i16> %vgpr, <i16 8, i16 8>
109 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
110 ret void
111 }
112
113 ; GCN-LABEL: {{^}}v_ashr_v4i16:
114 ; GCN: {{buffer|flat}}_load_dwordx2
115 ; GCN: {{buffer|flat}}_load_dwordx2
116 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
117 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
118 ; GCN: {{buffer|flat}}_store_dwordx2
119 define void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
120 %tid = call i32 @llvm.amdgcn.workitem.id.x()
121 %tid.ext = sext i32 %tid to i64
122 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
123 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
124 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
125 %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
126 %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
127 %result = ashr <4 x i16> %a, %b
128 store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
129 ret void
130 }
131
132 ; GCN-LABEL: {{^}}ashr_v_imm_v4i16:
133 ; GCN: {{buffer|flat}}_load_dwordx2
134 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
135 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
136 ; GCN: {{buffer|flat}}_store_dwordx2
137 define void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
138 %tid = call i32 @llvm.amdgcn.workitem.id.x()
139 %tid.ext = sext i32 %tid to i64
140 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
141 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
142 %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
143 %result = ashr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
144 store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
145 ret void
146 }
147
148 declare i32 @llvm.amdgcn.workitem.id.x() #1
149
150 attributes #0 = { nounwind }
151 attributes #1 = { nounwind readnone }
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
23
34 ; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
45 ; GCN: s_load_dword [[VEC:s[0-9]+]]
6970 }
7071
7172 ; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
72 ; GCN: buffer_load_ushort
73 ; GCN: buffer_load_ushort
74 ; GCN: buffer_store_short
75 ; GCN: buffer_store_short
73 ; SICIVI: buffer_load_ushort
74 ; SICIVI: buffer_load_ushort
75 ; SICIVI: buffer_store_short
76 ; SICIVI: buffer_store_short
77
78 ; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c
79 ; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30
80 ; GFX9-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], [[LOAD0]]
81 ; GFX9-DAG: buffer_store_short [[VLOAD0]], off
82 ; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]]
83 ; GFX9-DAG: buffer_store_short [[VLOAD1]], off
7684 define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
7785 %p0 = extractelement <4 x i16> %foo, i32 0
7886 %p1 = extractelement <4 x i16> %foo, i32 2
7987 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
80 store i16 %p1, i16 addrspace(1)* %out, align 2
81 store i16 %p0, i16 addrspace(1)* %out1, align 2
88 store volatile i16 %p1, i16 addrspace(1)* %out, align 2
89 store volatile i16 %p0, i16 addrspace(1)* %out1, align 2
8290 ret void
8391 }
8492
8795 ; GCN: buffer_load_ushort
8896 ; GCN: buffer_load_ushort
8997
90 ; GCN: buffer_store_short
91 ; GCN: buffer_store_short
92 ; GCN: buffer_store_short
98 ; SICIVI: buffer_store_short
99 ; SICIVI: buffer_store_short
100 ; SICIVI: buffer_store_short
101
102 ; GFX9: buffer_store_dword
103 ; GFX9: buffer_store_dword
93104
94105 ; GCN: buffer_load_ushort
95106 ; GCN: buffer_store_short
101112 }
102113
103114 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i16:
104 ; GCN: buffer_load_ushort
105 ; GCN: buffer_load_ushort
106 ; GCN: buffer_load_ushort
107 ; GCN: buffer_load_ushort
115 ; SICIVI: buffer_load_ushort
116 ; SICIVI: buffer_load_ushort
117 ; SICIVI: buffer_load_ushort
118 ; SICIVI: buffer_load_ushort
108119
109 ; GCN: buffer_store_short
110 ; GCN: buffer_store_short
111 ; GCN: buffer_store_short
112 ; GCN: buffer_store_short
120 ; SICIVI: buffer_store_short
121 ; SICIVI: buffer_store_short
122 ; SICIVI: buffer_store_short
123 ; SICIVI: buffer_store_short
113124
114 ; GCN: buffer_load_ushort
115 ; GCN: buffer_store_short
125 ; SICIVI: buffer_load_ushort
126 ; SICIVI: buffer_store_short
127
128 ; GFX9: s_load_dword
129 ; GFX9: buffer_store_dword
130 ; GFX9: buffer_store_dword
131 ; GFX9: buffer_load_ushort
132 ; GFX9: buffer_store_short
116133 define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
117134 %p0 = extractelement <4 x i16> %foo, i32 %idx
118135 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
0 ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
11 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
23
34 ; DAGCombiner will transform:
45 ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
56 ; unless isFabsFree returns true
67
7 ; GCN-LABEL: {{^}}fabs_free_f16:
8 ; GCN-LABEL: {{^}}s_fabs_free_f16:
89 ; GCN: flat_load_ushort [[VAL:v[0-9]+]],
910 ; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
1011 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1112
12 define void @fabs_free_f16(half addrspace(1)* %out, i16 %in) {
13 define void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
1314 %bc= bitcast i16 %in to half
1415 %fabs = call half @llvm.fabs.f16(half %bc)
1516 store half %fabs, half addrspace(1)* %out
1617 ret void
1718 }
1819
19 ; GCN-LABEL: {{^}}fabs_f16:
20 ; GCN-LABEL: {{^}}s_fabs_f16:
2021 ; CI: flat_load_ushort [[VAL:v[0-9]+]],
2122 ; CI: v_and_b32_e32 [[CVT0:v[0-9]+]], 0x7fff, [[VAL]]
2223 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
23 define void @fabs_f16(half addrspace(1)* %out, half %in) {
24 define void @s_fabs_f16(half addrspace(1)* %out, half %in) {
2425 %fabs = call half @llvm.fabs.f16(half %in)
2526 store half %fabs, half addrspace(1)* %out
2627 ret void
2728 }
2829
2930 ; FIXME: Should be able to use single and
30 ; GCN-LABEL: {{^}}fabs_v2f16:
31
31 ; GCN-LABEL: {{^}}s_fabs_v2f16:
3232 ; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
33 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
34 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
33 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
34 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
35 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
36 ; CI: v_or_b32_e32
3537
3638 ; VI: flat_load_ushort [[LO:v[0-9]+]]
3739 ; VI: flat_load_ushort [[HI:v[0-9]+]]
4244 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
4345 ; VI: v_or_b32
4446 ; VI: flat_store_dword
45 define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
47
48 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
49 ; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
50 define void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
4651 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
4752 store <2 x half> %fabs, <2 x half> addrspace(1)* %out
4853 ret void
4954 }
5055
51 ; GCN-LABEL: {{^}}fabs_v4f16:
56 ; GCN-LABEL: {{^}}s_fabs_v4f16:
5257 ; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
53 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
54 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
55 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
56 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
58 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
59 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
60 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
61 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
5762
5863 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
5964 ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
6267 ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
6368
6469 ; GCN: flat_store_dwordx2
65 define void @fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
70 define void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
6671 %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
6772 store <4 x half> %fabs, <4 x half> addrspace(1)* %out
6873 ret void
8893 ret void
8994 }
9095
91 declare half @llvm.fabs.f16(half) readnone
92 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone
93 declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone
96 ; GCN-LABEL: {{^}}v_fabs_v2f16:
97 ; GCN: flat_load_dword [[VAL:v[0-9]+]]
98 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, [[VAL]]
99 define void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
100 %tid = call i32 @llvm.amdgcn.workitem.id.x()
101 %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
102 %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
103 %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
104 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
105 store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out
106 ret void
107 }
108
109 ; GCN-LABEL: {{^}}fabs_free_v2f16:
110 ; GCN: s_load_dword [[VAL:s[0-9]+]]
111 ; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
112 define void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
113 %bc = bitcast i32 %in to <2 x half>
114 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
115 store <2 x half> %fabs, <2 x half> addrspace(1)* %out
116 ret void
117 }
118
119 ; GCN-LABEL: {{^}}v_fabs_fold_v2f16:
120 ; GCN: flat_load_dword [[VAL:v[0-9]+]]
121
122 ; CI: v_cvt_f32_f16_e32
123 ; CI: v_cvt_f32_f16_e32
124 ; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
125 ; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
126 ; CI: v_cvt_f16_f32
127 ; CI: v_cvt_f16_f32
128
129 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
130 ; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
131 ; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
132
133 ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
134 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
135 define void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
136 %val = load <2 x half>, <2 x half> addrspace(1)* %in
137 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
138 %fmul = fmul <2 x half> %fabs, %val
139 store <2 x half> %fmul, <2 x half> addrspace(1)* %out
140 ret void
141 }
142
143 declare half @llvm.fabs.f16(half) #1
144 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
145 declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
146 declare i32 @llvm.amdgcn.workitem.id.x() #1
147
148 attributes #0 = { nounwind }
149 attributes #1 = { nounwind readnone }
None ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
12
23 declare half @llvm.fabs.f16(half) #0
34 declare half @llvm.canonicalize.f16(half) #0
203204 }
204205
205206 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
206 ; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}}
207 ; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
208 ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
207 ; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}}
208 ; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
209 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
210
211 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
212 ; GFX9: buffer_store_dword [[REG]]
209213 define void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
210214 %val = load <2 x half>, <2 x half> addrspace(1)* %out
211215 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
215219
216220 ; FIXME: Fold modifier
217221 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
218 ; GCN: v_bfe_u32
219 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
220 ; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
221 ; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
222 ; GCN: v_or_b32
222 ; VI: v_bfe_u32
223 ; VI: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
224 ; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
225 ; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
226 ; VI: v_or_b32
227
228 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
229 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
223230 ; GCN: buffer_store_dword
224231 define void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
225232 %val = load <2 x half>, <2 x half> addrspace(1)* %out
230237 }
231238
232239 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
233 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
234 ; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
235 ; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
236 ; GCN: v_or_b32
240 ; VI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
241 ; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
242 ; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
243 ; VI: v_or_b32
244
245 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
246 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
237247 ; GCN: buffer_store_dword
238248 define void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
239249 %val = load <2 x half>, <2 x half> addrspace(1)* %out
246256
247257 ; FIXME: Fold modifier
248258 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
249 ; GCN: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
250 ; GCN: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]]
251 ; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
252 ; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]]
253 ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
259 ; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
260 ; VI: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]]
261 ; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
262 ; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]]
263 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
264
265 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
266 ; GFX9: buffer_store_dword [[REG]]
254267 define void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
255268 %val = load <2 x half>, <2 x half> addrspace(1)* %out
256269 %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val
260273 }
261274
262275 ; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
263 ; GCN: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}}
264 ; GCN: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
265 ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
276 ; VI: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}}
277 ; VI: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
278 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
279
280 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}
281 ; GFX9: buffer_store_dword [[REG]]
266282 define void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
267283 %val = bitcast i32 %val.arg to <2 x half>
268284 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
23
34 declare half @llvm.copysign.f16(half, half)
45 declare float @llvm.copysign.f32(float, float)
78 declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>)
89 declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)
910
10 ; FUNC-LABEL: {{^}}test_copysign_f16:
11 ; GCN-LABEL: {{^}}test_copysign_f16:
1112 ; SI: buffer_load_ushort v[[MAG:[0-9]+]]
1213 ; SI: buffer_load_ushort v[[SIGN:[0-9]+]]
1314 ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
3334 ret void
3435 }
3536
36 ; FUNC-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
37 ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
3738 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
3839 ; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
3940 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
5455 ret void
5556 }
5657
57 ; FUNC-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
58 ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
5859 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
5960 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
6061 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
7677 ret void
7778 }
7879
79 ; FUNC-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
80 ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
8081 ; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
8182 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
8283 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
99100 ret void
100101 }
101102
102 ; FUNC-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
103 ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
103104 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
104105 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
105106 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
122123 ret void
123124 }
124125
125 ; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
126 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
126127 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
127128 ; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
128129 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
147148 ret void
148149 }
149150
150 ; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
151 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
151152 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
152153 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
153154 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
172173 ret void
173174 }
174175
175 ; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
176 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
176177 ; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
177178 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
178179 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
199200 ret void
200201 }
201202
202 ; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16:
203 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16:
203204 ; GCN: v_bfi_b32
204205 ; GCN: s_endpgm
205206 define void @test_copysign_out_f16_mag_f64_sign_f16(
215216 ret void
216217 }
217218
218 ; FUNC-LABEL: {{^}}test_copysign_v2f16:
219 ; GCN-LABEL: {{^}}test_copysign_v2f16:
219220 ; GCN: v_bfi_b32
220221 ; GCN: v_bfi_b32
221222 ; GCN: s_endpgm
229230 ret void
230231 }
231232
232 ; FUNC-LABEL: {{^}}test_copysign_v3f16:
233 ; GCN-LABEL: {{^}}test_copysign_v3f16:
233234 ; GCN: v_bfi_b32
234235 ; GCN: v_bfi_b32
235236 ; GCN: v_bfi_b32
244245 ret void
245246 }
246247
247 ; FUNC-LABEL: {{^}}test_copysign_v4f16:
248 ; GCN-LABEL: {{^}}test_copysign_v4f16:
248249 ; GCN: v_bfi_b32
249250 ; GCN: v_bfi_b32
250251 ; GCN: v_bfi_b32
0 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
4
5 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
6 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
9
10 declare i32 @llvm.amdgcn.workitem.id.x() #1
11 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
12 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
13
14 ; GCN-LABEL: {{^}}fmuladd_v2f16:
15 ; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
16 ; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
17
18 ; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
19 define void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
20 <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
21 %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
22 %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
23 %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
24 %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2)
25 store <2 x half> %r3, <2 x half> addrspace(1)* %out
26 ret void
27 }
28
29 ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16:
30 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
31 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
32 ; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
33 ; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
34
35 ; GFX9-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
36
37 ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
38 ; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
39 define void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
40 %tid = call i32 @llvm.amdgcn.workitem.id.x()
41 %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
42 %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
43 %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
44
45 %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
46 %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1
47
48 %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> , <2 x half> %r1, <2 x half> %r2)
49 store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
50 ret void
51 }
52
53 ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_v2f16:
54 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
55 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
56 ; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
57 ; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
58
59 ; GFX9-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
60
61 ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
62 ; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
63 define void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
64 %tid = call i32 @llvm.amdgcn.workitem.id.x()
65 %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
66 %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
67 %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
68
69 %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
70 %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1
71
72 %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r1, <2 x half> , <2 x half> %r2)
73 store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
74 ret void
75 }
76
77 ; GCN-LABEL: {{^}}fadd_a_a_b_v2f16:
78 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
79 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
80 ; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
81 ; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
82
83 ; GFX9-DENORM-STRICT: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
84 ; GFX9-DENORM-STRICT: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
85
86 ; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
87 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
88 define void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out,
89 <2 x half> addrspace(1)* %in1,
90 <2 x half> addrspace(1)* %in2) #0 {
91 %tid = call i32 @llvm.amdgcn.workitem.id.x()
92 %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
93 %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
94 %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
95
96 %r0 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
97 %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1
98
99 %add.0 = fadd <2 x half> %r0, %r0
100 %add.1 = fadd <2 x half> %add.0, %r1
101 store <2 x half> %add.1, <2 x half> addrspace(1)* %gep.out
102 ret void
103 }
104
105 attributes #0 = { nounwind }
106 attributes #1 = { nounwind readnone }
None ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=CIVI %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN -check-prefix=CIVI %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s
23
34 ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
45 ; CI: v_cvt_f32_f16_e32
56 ; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
67 ; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}}
78
8 ; VI-NOT: _and
9 ; VI: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
9 ; GFX89-NOT: _and
10 ; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
1011 define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
1112 %fabs = call half @llvm.fabs.f16(half %x)
12 %fsub = fsub half -0.000000e+00, %fabs
13 %fsub = fsub half -0.0, %fabs
1314 %fadd = fadd half %y, %fsub
1415 store half %fadd, half addrspace(1)* %out, align 2
1516 ret void
2122 ; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}}
2223 ; CI: v_cvt_f16_f32_e32
2324
24 ; VI-NOT: _and
25 ; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}|
26 ; VI-NOT: [[MUL]]
27 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
25 ; GFX89-NOT: _and
26 ; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}|
27 ; GFX89-NOT: [[MUL]]
28 ; GFX89: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
2829 define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
2930 %fabs = call half @llvm.fabs.f16(half %x)
30 %fsub = fsub half -0.000000e+00, %fabs
31 %fsub = fsub half -0.0, %fabs
3132 %fmul = fmul half %y, %fsub
3233 store half %fmul, half addrspace(1)* %out, align 2
3334 ret void
4243 define void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
4344 %bc = bitcast i16 %in to half
4445 %fabs = call half @llvm.fabs.f16(half %bc)
45 %fsub = fsub half -0.000000e+00, %fabs
46 %fsub = fsub half -0.0, %fabs
4647 store half %fsub, half addrspace(1)* %out
4748 ret void
4849 }
4950
50 ; FIXME: Should use or
5151 ; GCN-LABEL: {{^}}fneg_fabs_f16:
5252 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
5353 define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
5454 %fabs = call half @llvm.fabs.f16(half %in)
55 %fsub = fsub half -0.000000e+00, %fabs
55 %fsub = fsub half -0.0, %fabs
5656 store half %fsub, half addrspace(1)* %out, align 2
5757 ret void
5858 }
6262 define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
6363 %val = load half, half addrspace(1)* %in, align 2
6464 %fabs = call half @llvm.fabs.f16(half %val)
65 %fsub = fsub half -0.000000e+00, %fabs
65 %fsub = fsub half -0.0, %fabs
6666 store half %fsub, half addrspace(1)* %out, align 2
6767 ret void
6868 }
6969
7070 ; FIXME: single bit op
71 ; GCN-LABEL: {{^}}fneg_fabs_v2f16:
72 ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
73 ; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
74 ; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
75 ; GCN: store_dword
76 define void @fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
71 ; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:
72 ; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
73 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
74 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
75 ; CIVI: flat_store_dword
76
77 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
78 define void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
7779 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
78 %fsub = fsub <2 x half> , %fabs
79 store <2 x half> %fsub, <2 x half> addrspace(1)* %out
80 %fneg.fabs = fsub <2 x half> , %fabs
81 store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out
8082 ret void
8183 }
8284
8385 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
84 ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
85 ; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
86 ; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
87 ; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
88 ; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
89 ; GCN: store_dwordx2
86 ; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
87 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
88 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
89 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
90 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
91
92 ; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
93 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
94 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
95
96 ; GCN: flat_store_dwordx2
9097 define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
9198 %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
92 %fsub = fsub <4 x half> 00000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %fabs
99 %fsub = fsub <4 x half> , half -0.0, half -0.0, half -0.0>, %fabs
93100 store <4 x half> %fsub, <4 x half> addrspace(1)* %out
94101 ret void
95102 }
96103
97 declare half @llvm.fabs.f16(half) readnone
98 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone
99 declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone
104 ; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16:
105 ; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
106 ; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
107 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
108 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
109
110 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
111 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
112
113 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
114 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
115 define void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
116 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
117 %fneg.fabs = fsub <2 x half> , %fabs
118 %mul = fmul <2 x half> %fneg.fabs,
119 store <2 x half> %mul, <2 x half> addrspace(1)* %out
120 ret void
121 }
122
123 ; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_v2f16:
124 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
125 ; GFX9: v_mov_b32_e32 [[VABS:v[0-9]+]], [[ABS]]
126 ; GFX9: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VABS]]
127 define void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
128 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
129 %fneg = fsub <2 x half> , %fabs
130 store <2 x half> %fabs, <2 x half> addrspace(1)* %out0
131 store <2 x half> %fneg, <2 x half> addrspace(1)* %out1
132 ret void
133 }
134
135 ; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_foldable_neg_v2f16:
136 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
137 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
138 define void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
139 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
140 %fneg = fsub <2 x half> , %fabs
141 %mul = fmul <2 x half> %fneg,
142 store <2 x half> %fabs, <2 x half> addrspace(1)* %out0
143 store <2 x half> %mul, <2 x half> addrspace(1)* %out1
144 ret void
145 }
146
147 declare half @llvm.fabs.f16(half) #1
148 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
149 declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
150
151 attributes #0 = { nounwind }
152 attributes #1 = { nounwind readnone }
None ; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
23
34 ; FIXME: Should be able to do scalar op
4 ; FUNC-LABEL: {{^}}s_fneg_f16:
5
6 define void @s_fneg_f16(half addrspace(1)* %out, half %in) {
7 %fneg = fsub half -0.000000e+00, %in
5 ; GCN-LABEL: {{^}}s_fneg_f16:
6 define void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 {
7 %fneg = fsub half -0.0, %in
88 store half %fneg, half addrspace(1)* %out
99 ret void
1010 }
1212 ; FIXME: Should be able to use bit operations when illegal type as
1313 ; well.
1414
15 ; FUNC-LABEL: {{^}}v_fneg_f16:
15 ; GCN-LABEL: {{^}}v_fneg_f16:
1616 ; GCN: flat_load_ushort [[VAL:v[0-9]+]],
1717 ; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
1818 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
1919 ; SI: buffer_store_short [[XOR]]
20 define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
21 %val = load half, half addrspace(1)* %in, align 2
22 %fneg = fsub half -0.000000e+00, %val
23 store half %fneg, half addrspace(1)* %out
20 define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
21 %tid = call i32 @llvm.amdgcn.workitem.id.x()
22 %gep.in = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid
23 %gep.out = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid
24 %val = load half, half addrspace(1)* %gep.in, align 2
25 %fneg = fsub half -0.0, %val
26 store half %fneg, half addrspace(1)* %gep.out
2427 ret void
2528 }
2629
27 ; FUNC-LABEL: {{^}}fneg_free_f16:
30 ; GCN-LABEL: {{^}}fneg_free_f16:
2831 ; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]],
2932
3033 ; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
3134 ; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]]
3235 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
33 define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) {
36 define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
3437 %bc = bitcast i16 %in to half
3538 %fsub = fsub half -0.0, %bc
3639 store half %fsub, half addrspace(1)* %out
3740 ret void
3841 }
3942
40 ; FUNC-LABEL: {{^}}v_fneg_fold_f16:
43 ; GCN-LABEL: {{^}}v_fneg_fold_f16:
4144 ; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]]
4245
4346 ; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
4851
4952 ; VI-NOT: [[NEG_VALUE]]
5053 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
51 define void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
54 define void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
5255 %val = load half, half addrspace(1)* %in
5356 %fsub = fsub half -0.0, %val
5457 %fmul = fmul half %fsub, %val
5558 store half %fmul, half addrspace(1)* %out
5659 ret void
5760 }
61
62 ; FIXME: Terrible code with VI and even worse with SI/CI
63 ; GCN-LABEL: {{^}}s_fneg_v2f16:
64 ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
65 ; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
66 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
67 ; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
68 ; CI: v_or_b32_e32
69
70 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}}
71 ; VI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
72 ; VI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
73
74 ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
75 define void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
76 %fneg = fsub <2 x half> , %in
77 store <2 x half> %fneg, <2 x half> addrspace(1)* %out
78 ret void
79 }
80
81 ; GCN-LABEL: {{^}}v_fneg_v2f16:
82 ; GCN: flat_load_dword [[VAL:v[0-9]+]]
83 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]]
84 define void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
85 %tid = call i32 @llvm.amdgcn.workitem.id.x()
86 %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
87 %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
88 %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
89 %fneg = fsub <2 x half> , %val
90 store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
91 ret void
92 }
93
94 ; GCN-LABEL: {{^}}fneg_free_v2f16:
95 ; GCN: s_load_dword [[VAL:s[0-9]+]]
96 ; CIVI: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000
97
98 ; GFX9: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
99 ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VVAL]]
100 define void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
101 %bc = bitcast i32 %in to <2 x half>
102 %fsub = fsub <2 x half> , %bc
103 store <2 x half> %fsub, <2 x half> addrspace(1)* %out
104 ret void
105 }
106
107 ; GCN-LABEL: {{^}}v_fneg_fold_v2f16:
108 ; GCN: flat_load_dword [[VAL:v[0-9]+]]
109
110 ; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
111 ; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
112 ; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
113 ; CI: v_cvt_f16_f32
114 ; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
115 ; CI: v_cvt_f16_f32
116
117 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
118 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
119 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
120
121 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}}
122 define void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
123 %val = load <2 x half>, <2 x half> addrspace(1)* %in
124 %fsub = fsub <2 x half> , %val
125 %fmul = fmul <2 x half> %fsub, %val
126 store <2 x half> %fmul, <2 x half> addrspace(1)* %out
127 ret void
128 }
129
130 declare i32 @llvm.amdgcn.workitem.id.x() #1
131
132 attributes #0 = { nounwind }
133 attributes #1 = { nounwind readnone }
None ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
23
34 ; GCN-LABEL: {{^}}fpext_f16_to_f32
45 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
78 ; GCN: s_endpgm
89 define void @fpext_f16_to_f32(
910 float addrspace(1)* %r,
10 half addrspace(1)* %a) {
11 half addrspace(1)* %a) #0 {
1112 entry:
1213 %a.val = load half, half addrspace(1)* %a
1314 %r.val = fpext half %a.val to float
2324 ; GCN: s_endpgm
2425 define void @fpext_f16_to_f64(
2526 double addrspace(1)* %r,
26 half addrspace(1)* %a) {
27 half addrspace(1)* %a) #0 {
2728 entry:
2829 %a.val = load half, half addrspace(1)* %a
2930 %r.val = fpext half %a.val to double
3334
3435 ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32
3536 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
36 ; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
37 ; GCN: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
37 ; GFX89-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
38 ; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
3839 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
3940 ; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
4041 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
4142 ; GCN: s_endpgm
4243 define void @fpext_v2f16_to_v2f32(
4344 <2 x float> addrspace(1)* %r,
44 <2 x half> addrspace(1)* %a) {
45 <2 x half> addrspace(1)* %a) #0 {
4546 entry:
4647 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
4748 %r.val = fpext <2 x half> %a.val to <2 x float>
5051 }
5152
5253 ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64
53 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
54 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
55 ; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
56 ; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
57 ; GCN: v_cvt_f64_f32_e32 v{{\[}}{{[0-9]+}}:[[R_F64_3:[0-9]+]]{{\]}}, v[[A_F32_1]]
58 ; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:{{[0-9]+}}{{\]}}, v[[A_F32_0]]
59 ; GCN: buffer_store_dwordx4 v{{\[}}[[R_F64_0]]:[[R_F64_3]]{{\]}}
54 ; GCN: buffer_load_dword
55 ; GCN-DAG: v_lshrrev_b32_e32
56 ; GCN-DAG: v_cvt_f32_f16_e32
57 ; GCN: v_cvt_f32_f16_e32
58
59 ; GCN: v_cvt_f64_f32_e32
60 ; GCN: v_cvt_f64_f32_e32
61 ; GCN: buffer_store_dwordx4
6062 ; GCN: s_endpgm
6163 define void @fpext_v2f16_to_v2f64(
6264 <2 x double> addrspace(1)* %r,
128130
129131 ; FIXME: Using the source modifier here only wastes code size
130132 ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
131 ; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
133 ; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
132134
133135 ; GCN: store_dword [[CVT]]
134136 ; GCN: store_short [[XOR]]
151153 ; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]]
152154 ; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
153155
154 ; VI-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
155 ; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]]
156 ; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
157 ; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]]
156158
157159 ; GCN: buffer_store_dword [[CVTA_NEG]]
158160 ; GCN: buffer_store_short [[MUL]]
197199 ; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
198200 ; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[CVTA]]
199201
200 ; VI-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]|
201 ; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]]
202 ; GFX89-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]|
203 ; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]]
202204
203205 ; GCN: buffer_store_dword [[ABS_A]]
204206 ; GCN: buffer_store_short [[MUL]]
244246 ; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
245247 ; SI: v_or_b32_e32 [[FABS_FNEG:v[0-9]+]], 0x80000000, [[CVTA]]
246248
247 ; VI-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]|
248 ; VI-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]]
249 ; GFX89-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]|
250 ; GFX89-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]]
249251
250252 ; GCN: buffer_store_dword [[FABS_FNEG]]
251253 ; GCN: buffer_store_short [[MUL]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,+fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-FLUSH %s
24
35 ; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
46 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
3537 ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}}
3638 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
3739 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
38 ; GCN-DAG: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
39 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
40 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
40 ; SIVI-DAG: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
41 ; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
42 ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
43
44 ; GFX9-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
45 ; GFX9-FLUSH: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
46
47 ; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
48
4149 ; GCN: buffer_store_dword v[[R_V2_F16]]
4250 ; GCN: s_endpgm
4351 define void @fptrunc_v2f32_to_v2f16(
5664 ; GCN: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
5765 ; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
5866 ; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
59 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
60 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
61 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
67
68 ; SIVI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
69 ; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
70 ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
71
72 ; GFX9-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
73 ; GFX9-FLUSH: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
74
75 ; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
76
6277 ; GCN: buffer_store_dword v[[R_V2_F16]]
6378 define void @fptrunc_v2f64_to_v2f16(
6479 <2 x half> addrspace(1)* %r,
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
23
34 ; GCN-LABEL: {{^}}fsub_f16:
45 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
78 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
89 ; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
910 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
10 ; VI: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
11 ; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
1112 ; GCN: buffer_store_short v[[R_F16]]
1213 ; GCN: s_endpgm
1314 define void @fsub_f16(
2728 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
2829 ; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
2930 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
30 ; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
31 ; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
3132 ; GCN: buffer_store_short v[[R_F16]]
3233 ; GCN: s_endpgm
3334 define void @fsub_f16_imm_a(
4546 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
4647 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], -2.0, v[[A_F32]]
4748 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
48 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
49 ; GFX89: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
4950 ; GCN: buffer_store_short v[[R_F16]]
5051 ; GCN: s_endpgm
5152 define void @fsub_f16_imm_b(
6162 ; GCN-LABEL: {{^}}fsub_v2f16:
6263 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
6364 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
64 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
65 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
65 ; SIVI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
66 ; SIVI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
67
6668 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
6769 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
6870 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
7173 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
7274 ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
7375 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
74 ; VI: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
75 ; VI: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
76 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
77 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
78 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
76
77 ; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
78 ; VI: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
79
80 ; SIVI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
81 ; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
82 ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
83
84 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
85
7986 ; GCN: buffer_store_dword v[[R_V2_F16]]
8087 ; GCN: s_endpgm
8188 define void @fsub_v2f16(
93100 ; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
94101 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
95102 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
96 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
103 ; SIVI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
104
97105 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
98106 ; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
99107 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
100108 ; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
101109 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
110
102111 ; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
103112 ; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
104 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
105 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
106 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
113
114 ; SIVI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
115 ; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
116 ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
117
118 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
119 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
120
107121 ; GCN: buffer_store_dword v[[R_V2_F16]]
108122 ; GCN: s_endpgm
109123 define void @fsub_v2f16_imm_a(
119133 ; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
120134 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
121135 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
122 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
136 ; SIVI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
137
123138 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
124139 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
125140 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
127142 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
128143 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
129144 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
130 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
131 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
132 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
145
146 ; SIVI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
147 ; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
148 ; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
149
150 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
151 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[A_V2_F16]]{{$}}
152
133153 ; GCN: buffer_store_dword v[[R_V2_F16]]
134154 ; GCN: s_endpgm
135155 define void @fsub_v2f16_imm_b(
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
3 ; FIXME: Merge into imm.ll
4
5 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
6 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
7 ; GCN: buffer_store_dword [[REG]]
; Store a splat of -0.0 reinterpreted as i16 (0x8000 per lane); the constant
; was lost in extraction and is restored to match the 0x80008000 CHECK above.
define void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
  store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
  ret void
}
12
13 ; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
14 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
15 ; GCN: buffer_store_dword [[REG]]
; Store splat +0.0 (all-zero bit pattern, matching the v_mov 0 CHECK above).
define void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
  ret void
}
20
21 ; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
22 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
23 ; GCN: buffer_store_dword [[REG]]
; Store splat -0.0 (0x8000 per fp16 lane; CHECK expects 0x80008000).
define void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
  ret void
}
28
29 ; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
30 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}}
31 ; GCN: buffer_store_dword [[REG]]
; Store splat 0.5 (fp16 0x3800 per lane; CHECK expects 0x38003800).
define void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
  ret void
}
36
37 ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
38 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}}
39 ; GCN: buffer_store_dword [[REG]]
; Store splat -0.5 (fp16 0xB800 per lane; CHECK expects 0xb800b800).
define void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
  ret void
}
44
45 ; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
46 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
47 ; GCN: buffer_store_dword [[REG]]
; Store splat 1.0 (fp16 0x3C00 per lane; CHECK expects 0x3c003c00).
define void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
  ret void
}
52
53 ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
54 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
55 ; GCN: buffer_store_dword [[REG]]
; Store splat -1.0 (fp16 0xBC00 per lane; CHECK expects 0xbc00bc00).
define void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
  ret void
}
60
61 ; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
62 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}}
63 ; GCN: buffer_store_dword [[REG]]
; Store splat 2.0 (fp16 0x4000 per lane; CHECK expects 0x40004000).
define void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
  ret void
}
68
69 ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
70 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}}
71 ; GCN: buffer_store_dword [[REG]]
; Store splat -2.0 (fp16 0xC000 per lane; CHECK expects 0xc000c000).
define void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
  ret void
}
76
77 ; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
78 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}}
79 ; GCN: buffer_store_dword [[REG]]
; Store splat 4.0 (fp16 0x4400 per lane; CHECK expects 0x44004400).
define void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
  ret void
}
84
85 ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
86 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}}
87 ; GCN: buffer_store_dword [[REG]]
; Store splat -4.0 (fp16 0xC400 per lane; CHECK expects 0xc400c400).
define void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
  ret void
}
92
93 ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
94 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}}
95 ; GCN: buffer_store_dword [[REG]]
; Store splat 1/(2*pi) (fp16 hex 0xH3118; CHECK expects 0x31183118).
define void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
  ret void
}
100
101 ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
102 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}}
103 ; GCN: buffer_store_dword [[REG]]
; Store splat -1/(2*pi) (fp16 hex 0xHB118; CHECK expects 0xb118b118).
define void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
  ret void
}
108
109 ; GCN-LABEL: {{^}}store_literal_imm_v2f16:
110 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
111 ; GCN: buffer_store_dword [[REG]]
; Store a splat that is NOT an inline constant (fp16 0x6C00 = 4096.0),
; forcing a literal; CHECK expects 0x6c006c00.
define void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
  store <2 x half> <half 0xH6C00, half 0xH6C00>, <2 x half> addrspace(1)* %out
  ret void
}
116
117 ; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16:
118 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
119 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
120 ; GFX9: buffer_store_dword [[REG]]
121
122 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
123 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
124 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]]
125 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]]
126 ; VI: v_or_b32
127 ; VI: buffer_store_dword
; fadd with splat +0.0; constant restored to match the "v_pk_add_f16 ..., 0" CHECK.
define void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.0, half 0.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
133
134 ; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16:
135 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
136 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}}
137 ; GFX9: buffer_store_dword [[REG]]
138
139 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
140 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
141 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]]
142 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]]
143 ; VI: v_or_b32
144 ; VI: buffer_store_dword
; fadd with splat 0.5 (matches the "..., 0.5" CHECK immediates above).
define void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
150
151 ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16:
152 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
153 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}}
154 ; GFX9: buffer_store_dword [[REG]]
155
156 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
157 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
158 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]]
159 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]]
160 ; VI: v_or_b32
161 ; VI: buffer_store_dword
; fadd with splat -0.5 (matches the "..., -0.5" CHECK immediates above).
define void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -0.5, half -0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
167
168 ; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16:
169 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
170 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}}
171 ; GFX9: buffer_store_dword [[REG]]
172
173 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
174 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
175 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]]
176 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]]
177 ; VI: v_or_b32
178 ; VI: buffer_store_dword
; fadd with splat 1.0 (matches the "..., 1.0" CHECK immediates above).
define void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 1.0, half 1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
184
185 ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16:
186 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
187 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}}
188 ; GFX9: buffer_store_dword [[REG]]
189
190 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
191 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
192 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]]
193 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]]
194 ; VI: v_or_b32
195 ; VI: buffer_store_dword
; fadd with splat -1.0 (matches the "..., -1.0" CHECK immediates above).
define void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -1.0, half -1.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
201
202 ; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16:
203 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
204 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}}
205 ; GFX9: buffer_store_dword [[REG]]
206
207 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
208 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
209 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]]
210 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]]
211 ; VI: v_or_b32
212 ; VI: buffer_store_dword
; fadd with splat 2.0 (matches the "..., 2.0" CHECK immediates above).
define void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 2.0, half 2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
218
219 ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16:
220 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
221 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}}
222 ; GFX9: buffer_store_dword [[REG]]
223
224 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
225 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
226 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]]
227 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL1]]
228 ; VI: v_or_b32
229 ; VI: buffer_store_dword
; fadd with splat -2.0 (matches the "..., -2.0" CHECK immediates above).
define void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -2.0, half -2.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
235
236 ; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16:
237 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
238 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}}
239 ; GFX9: buffer_store_dword [[REG]]
240
241 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
242 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
243 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]]
244 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]]
245 ; VI: v_or_b32
246 ; VI: buffer_store_dword
; fadd with splat 4.0 (matches the "..., 4.0" CHECK immediates above).
define void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 4.0, half 4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
252
253 ; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16:
254 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
255 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}}
256 ; GFX9: buffer_store_dword [[REG]]
257
258 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
259 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
260 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]]
261 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]]
262 ; VI: v_or_b32
263 ; VI: buffer_store_dword
; fadd with splat -4.0 (matches the "..., -4.0" CHECK immediates above).
define void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half -4.0, half -4.0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
269
270 ; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16:
271 ; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
272 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
273 ; GFX9: buffer_store_dword [[REG]]
274
275 ; VI: buffer_load_dword
276 ; VI-NOT: and
277 ; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16,
278 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
279 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
280 ; VI: v_or_b32
281 ; VI: buffer_store_dword
; Loaded (VGPR) operand + splat 0.5 — exercises commuting the inline
; immediate into the second operand of the packed add.
define void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
288
289 ; GCN-LABEL: {{^}}commute_add_literal_v2f16:
290 ; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
291 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x64006400
292 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]]
293 ; GFX9: buffer_store_dword [[REG]]
294
295 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
296 ; VI-DAG: buffer_load_dword
297 ; VI-NOT: and
298 ; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16,
299 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
300 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
301 ; VI: v_or_b32
302 ; VI: buffer_store_dword
; Loaded operand + non-inline splat literal (fp16 0x6400 = 1024.0);
; CHECK expects the 0x64006400 materialized constant.
define void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %x = load <2 x half>, <2 x half> addrspace(1)* %in
  %y = fadd <2 x half> %x, <half 0xH6400, half 0xH6400>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
309
310 ; GCN-LABEL: {{^}}add_inline_imm_1_v2f16:
311 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
312 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1{{$}}
313 ; GFX9: buffer_store_dword [[REG]]
314
315 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
316 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
317 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]]
318 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]]
319 ; VI: v_or_b32
320 ; VI: buffer_store_dword
; fadd with the fp16 bit pattern of integer 1 (0xH0001); matches the
; "..., 1" integer-style inline immediate in the CHECK lines.
define void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
326
327 ; GCN-LABEL: {{^}}add_inline_imm_2_v2f16:
328 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
329 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2{{$}}
330 ; GFX9: buffer_store_dword [[REG]]
331
332 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
333 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
334 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]]
335 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]]
336 ; VI: v_or_b32
337 ; VI: buffer_store_dword
; fadd with the fp16 bit pattern of integer 2 (0xH0002); matches "..., 2" CHECKs.
define void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
343
344 ; GCN-LABEL: {{^}}add_inline_imm_16_v2f16:
345 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
346 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16{{$}}
347 ; GFX9: buffer_store_dword [[REG]]
348
349 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
350 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
351 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]]
352 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]]
353 ; VI: v_or_b32
354 ; VI: buffer_store_dword
; fadd with the fp16 bit pattern of integer 16 (0xH0010); matches "..., 16" CHECKs.
define void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
360
361 ; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16:
362 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
363 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1{{$}}
364 ; GFX9: buffer_store_dword [[REG]]
365
366 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
367 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
368 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL0]]
369 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]]
370 ; VI: v_or_b32
371 ; VI: buffer_store_dword
; fadd with the fp16 bit pattern of integer -1 (0xHFFFF); matches "..., -1" CHECKs.
define void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xHFFFF, half 0xHFFFF>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
377
378 ; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16:
379 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
380 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2{{$}}
381 ; GFX9: buffer_store_dword [[REG]]
382
383 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
384 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
385 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL0]]
386 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]]
387 ; VI: v_or_b32
388 ; VI: buffer_store_dword
; fadd with the fp16 bit pattern of integer -2 (0xHFFFE); matches "..., -2" CHECKs.
define void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xHFFFE, half 0xHFFFE>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
394
395 ; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16:
396 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
397 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -16{{$}}
398 ; GFX9: buffer_store_dword [[REG]]
399
400 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
401 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
402 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL0]]
403 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]]
404 ; VI: v_or_b32
405 ; VI: buffer_store_dword
; fadd with the fp16 bit pattern of integer -16 (0xHFFF0); matches "..., -16" CHECKs.
define void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xHFFF0, half 0xHFFF0>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
411
412 ; GCN-LABEL: {{^}}add_inline_imm_63_v2f16:
413 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
414 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
415 ; GFX9: buffer_store_dword [[REG]]
416
417 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
418 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
419 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]]
420 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]]
421 ; VI: v_or_b32
422 ; VI: buffer_store_dword
; fadd with the fp16 bit pattern of integer 63 (0xH003F); matches "..., 63" CHECKs.
define void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
428
429 ; GCN-LABEL: {{^}}add_inline_imm_64_v2f16:
430 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
431 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
432 ; GFX9: buffer_store_dword [[REG]]
433
434 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
435 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
436 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]]
437 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]]
438 ; VI: v_or_b32
439 ; VI: buffer_store_dword
; fadd with the fp16 bit pattern of integer 64 (0xH0040); matches "..., 64" CHECKs.
define void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
  %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
  store <2 x half> %y, <2 x half> addrspace(1)* %out
  ret void
}
445
446 attributes #0 = { nounwind }
None ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI %s
0 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
12 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
23
34 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
56
67 ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
78 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}
9
10 ; GFX9-NOT: lshr
11 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]]
812 define void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
913 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
1014 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
1923 ; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
2024 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
2125 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
26
27 ; GFX9-NOT: [[ELT0]]
28 ; GFX9-NOT: [[VEC]]
29 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
2230 define void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
2331 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
2432 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
2634 ret void
2735 }
2836
37 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg:
38 ; GCN: s_load_dword [[ELT0:s[0-9]+]]
39 ; GCN: s_load_dword [[VEC:s[0-9]+]]
40
41 ; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
42 ; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
43 ; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
44 ; CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
45 ; CIVI-DAG: ; use [[SHR]]
46
47 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
48 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
49 ; GFX9-DAG: ; use [[ELT1]]
; Insert into lane 0 while the original hi lane is also used separately,
; so the backend cannot fold the extract away (exercises s_pack_ll on GFX9).
; Diff line-number artifacts stripped from the code lines.
define void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
  %elt1 = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ; Keep the hi element alive via inline asm so it is a real extra use.
  %use1 = zext i16 %elt1 to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}
59
2960 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi:
30 ; GCN: s_load_dword [[ELT0:s[0-9]+]]
31 ; GCN: s_load_dword [[VEC:s[0-9]+]]
61 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
62 ; GCN: s_load_dword [[VEC:s[0-9]+]]
63
64 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
65 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]]
66
67 ; GFX9-NOT: [[ELT0]]
68 ; GFX9-NOT: [[VEC]]
69 ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
; Insert the HIGH half of a 32-bit scalar arg into lane 0 (exercises
; s_pack_hh on GFX9). Diff line-number artifacts stripped from the code lines.
define void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}
78
79 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1:
80 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
81 ; GCN: s_load_dword [[VEC:s[0-9]+]],
3282
3383 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
3484 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
35 define void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
85
86 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
87 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]]
88 ; GFX9: ; use [[ELT1]]
; Same as the reghi case, but the truncated hi half is also used on its own,
; so the shift must survive (s_lshr + s_pack_lh on GFX9).
; Diff line-number artifacts stripped from the code lines.
define void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ; Extra scalar use of the inserted element.
  %use1 = zext i16 %elt to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}
99
100 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1:
101 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
102 ; GCN: s_load_dword [[VEC:s[0-9]+]],
103
104 ; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
105 ; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
106 ; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16
107 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
108
109 ; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
110 ; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16
111 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
112 ; GFX9: ; use [[ELT_HI]]
113 ; GFX9: ; use [[VEC_HI]]
; Both the inserted hi half and the vector's original hi lane have extra
; uses, so both s_lshr results must be kept (s_pack_ll on GFX9).
; Diff line-number artifacts stripped from the code lines.
define void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vec.hi = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt to i32
  %vec.hi.use1 = zext i16 %vec.hi to i32

  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
  ret void
}
43128
45130 ; GCN: s_load_dword [[VEC:s[0-9]+]]
46131
47132 ; GCN-NOT: s_lshr
48 ; GCN: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
49 ; GCN: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000
133
134 ; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
135 ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000
136
137 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7
50138 define void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
51139 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
52140 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
62150 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
63151
64152 ; GCN-NOT: shlr
153 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
65154 define void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
66155 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
67156 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
73162 ; GCN: s_load_dword [[VEC:s[0-9]+]]
74163 ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC:s[0-9]+]], 0xffff0000
75164 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x4500
165
166 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
167 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]]
76168 define void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
77169 %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
78170 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
81173 }
82174
83175 ; GCN-LABEL: {{^}}s_insertelement_v2f16_1:
84 ; GCN: s_load_dword [[VEC:s[0-9]+]]
176 ; GFX9: s_load_dword [[VEC:s[0-9]+]]
85177 ; GCN-NOT: s_lshr
86 ; GCN: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
87 ; GCN: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000
178
179 ; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
180 ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000
181
182 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500
88183 define void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
89184 %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
90185 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
96191 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
97192 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
98193 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e7, [[ELT1]]
194
195 ; GFX9-DAG: s_movk_i32 [[ELT0:s[0-9]+]], 0x3e7{{$}}
196 ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
197 ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], [[ELT0]], [[VEC]]
99198 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
100199 define void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
101200 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
116215 ; CIVI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
117216 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]]
118217
218 ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
219 ; GFX9-DAG: v_lshrrev_b32_e64 [[ELT0_SHIFT:v[0-9]+]], 16, [[ELT0]]
220 ; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]]
221
119222 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
120223 define void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
121224 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
136239 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
137240 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
138241
242 ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
243 ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], 53, [[VEC]]
244
139245 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
140246 define void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
141247 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
153259 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
154260 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
155261 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
262
263 ; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3e7
264 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
265 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
156266
157267 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
158268 define void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
170280 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
171281 ; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
172282 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
173
283 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
174284 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
175285 define void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
176286 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
188298
189299 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
190300 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x4500, [[ELT1]]
301
302 ; GFX9: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0x4500{{$}}
303 ; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
304 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, [[ELT0]]
191305
192306 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
193307 define void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
207321 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
208322 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
209323
324 ; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
325 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, 53
210326 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
211327 define void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
212328 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
223339 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
224340 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
225341
342 ; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x4500
343 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
344 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
345
226346 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
227347 define void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
228348 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
239359 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
240360 ; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
241361 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
242
362 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
243363 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
244364 define void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
245365 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
294414 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
295415 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
296416
297 ; VI-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
298 ; VI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
299 ; VI: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
417 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
418 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
419 ; GFX89: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
300420
301421 ; CI: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
302422 ; CI: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
321441 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
322442 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
323443
324 ; VI-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
325 ; VI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
326 ; VI: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
444 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
445 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
446 ; GFX89: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
327447
328448 ; CI: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
329449 ; CI: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
11 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
23
34 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
45 declare void @llvm.amdgcn.exp.compr.v2i16(i32, i32, <2 x i16>, <2 x i16>, i1, i1) #0
0 ; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
3
; Uniform <2 x i16> lshr: both operands arrive in SGPRs (s_load_dword).
; On GFX9 this selects the packed v_pk_lshrrev_b16 (the "rev" form takes the
; shift amount as the first source, hence [[RHS]] before [[VLHS]]); the LHS
; must first be copied to a VGPR. CI/VI have no packed op and expand to
; 32-bit shift/bfe/or sequences instead.
4 ; GCN-LABEL: {{^}}s_lshr_v2i16:
5 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
6 ; GFX9: s_load_dword [[RHS:s[0-9]+]]
7 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
8 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
9
10 ; CIVI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
11 ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
12 ; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
13 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
14 define void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
15 %result = lshr <2 x i16> %lhs, %rhs
16 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
17 ret void
18 }
19
; Divergent <2 x i16> lshr: both operands loaded per-thread (indexed by
; workitem id), so this exercises the VALU path on each target.
; GFX9 keeps it as a single packed shift; VI does two 16-bit shifts and
; repacks; CI masks each half to 16 bits before 32-bit shifts and repacks.
20 ; GCN-LABEL: {{^}}v_lshr_v2i16:
21 ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
22 ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
23 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
24
25 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
26 ; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
27 ; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
28 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
29 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
30
31 ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
32 ; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]]
33 ; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[RHS]]
34 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
35 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
36 ; CI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
37 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
38 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
39 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
40 define void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
41 %tid = call i32 @llvm.amdgcn.workitem.id.x()
42 %tid.ext = sext i32 %tid to i64
43 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
44 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
45 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
46 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
47 %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
48 %result = lshr <2 x i16> %a, %b
49 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
50 ret void
51 }
52
53 ; GCN-LABEL: {{^}}lshr_v_s_v2i16:
54 ; GFX9: s_load_dword [[RHS:s[0-9]+]]
55 ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
56 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
57 define void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
58 %tid = call i32 @llvm.amdgcn.workitem.id.x()
59 %tid.ext = sext i32 %tid to i64
60 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
61 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
62 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
63 %result = lshr <2 x i16> %vgpr, %sgpr
64 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
65 ret void
66 }
67
; Converse of lshr_v_s_v2i16: SGPR value shifted by a VGPR amount. Because
; v_pk_lshrrev_b16 takes the shift amount first, the uniform value lands in
; the second (src1) slot and no copy to VGPR is checked for here.
68 ; GCN-LABEL: {{^}}lshr_s_v_v2i16:
69 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
70 ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
71 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
72 define void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
73 %tid = call i32 @llvm.amdgcn.workitem.id.x()
74 %tid.ext = sext i32 %tid to i64
75 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
76 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
77 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
78 %result = lshr <2 x i16> %sgpr, %vgpr
79 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
80 ret void
81 }
82
; Splat-immediate value shifted by a loaded vector amount.
; NOTE(review): the vector constant operand was lost in extraction (angle
; brackets eaten); restored as <i16 8, i16 8> — the GFX9 CHECK line pins it,
; since both equal halves fold to the single inline operand '8' of
; v_pk_lshrrev_b16 (amount-first "rev" operand order: [[RHS]] then 8).
83 ; GCN-LABEL: {{^}}lshr_imm_v_v2i16:
84 ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
85 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8
86 define void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
87 %tid = call i32 @llvm.amdgcn.workitem.id.x()
88 %tid.ext = sext i32 %tid to i64
89 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
90 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
91 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
92 %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr
93 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
94 ret void
95 }
96
; Loaded vector shifted by a splat-immediate amount.
; NOTE(review): the trailing vector constant was lost in extraction; restored
; as <i16 8, i16 8> — the GFX9 CHECK shows the folded inline amount '8' as
; src0 of v_pk_lshrrev_b16 with the loaded value [[LHS]] as src1.
97 ; GCN-LABEL: {{^}}lshr_v_imm_v2i16:
98 ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
99 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]]
100 define void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
101 %tid = call i32 @llvm.amdgcn.workitem.id.x()
102 %tid.ext = sext i32 %tid to i64
103 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
104 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
105 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
106 %result = lshr <2 x i16> %vgpr, <i16 8, i16 8>
107 store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
108 ret void
109 }
110
; <4 x i16> splits into two <2 x i16> halves: dwordx2 loads/stores and, on
; GFX9, two packed shifts — one per 32-bit half.
111 ; GCN-LABEL: {{^}}v_lshr_v4i16:
112 ; GCN: {{buffer|flat}}_load_dwordx2
113 ; GCN: {{buffer|flat}}_load_dwordx2
114 ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
115 ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
116 ; GCN: {{buffer|flat}}_store_dwordx2
117 define void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
118 %tid = call i32 @llvm.amdgcn.workitem.id.x()
119 %tid.ext = sext i32 %tid to i64
120 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
121 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
122 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
123 %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
124 %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
125 %result = lshr <4 x i16> %a, %b
126 store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
127 ret void
128 }
129
; <4 x i16> variant of lshr_v_imm: immediate splat shift amount.
; NOTE(review): the trailing vector constant was lost in extraction; restored
; as <i16 8, i16 8, i16 8, i16 8> — both GFX9 CHECK lines fold the splat
; halves to the inline amount '8' of v_pk_lshrrev_b16, one per 32-bit half.
130 ; GCN-LABEL: {{^}}lshr_v_imm_v4i16:
131 ; GCN: {{buffer|flat}}_load_dwordx2
132 ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
133 ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
134 ; GCN: {{buffer|flat}}_store_dwordx2
135 define void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
136 %tid = call i32 @llvm.amdgcn.workitem.id.x()
137 %tid.ext = sext i32 %tid to i64
138 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
139 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
140 %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
141 %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
142 store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
143 ret void
144 }
145
146 declare i32 @llvm.amdgcn.workitem.id.x() #1
147
148 attributes #0 = { nounwind }
149 attributes #1 = { nounwind readnone }
None ; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
1
2
3 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
0 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VIPLUS %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VIPLUS %s
42
53 ; FIXME: Need to handle non-uniform case for function below (load without gep).
64 ; GCN-LABEL: {{^}}v_test_imax_sge_i16:
7 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
5 ; VIPLUS: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
86 define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
97 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
108 %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
1917 }
2018
2119 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Vector signed-max pattern: icmp sge + select is the canonical smax idiom.
; VI scalarizes to two v_max_i16; GFX9 matches the single packed v_pk_max_i16.
20 ; GCN-LABEL: {{^}}v_test_imax_sge_v2i16:
21 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
22 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
23
24 ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
25 define void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
26 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
27 %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
28 %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid
29 %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
30 %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4
31 %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4
32 %cmp = icmp sge <2 x i16> %a, %b
33 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
34 store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4
35 ret void
36 }
37
; Odd-width vector: <3 x i16> smax. VI gets exactly three v_max_i16 (the
; VI-NOT line guards against a spurious fourth op for the padding lane);
; GFX9 uses two packed ops, so the padded fourth lane rides along.
38 ; FIXME: Need to handle non-uniform case for function below (load without gep).
39 ; GCN-LABEL: {{^}}v_test_imax_sge_v3i16:
40 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
41 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
42 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
43 ; VI-NOT: v_max_i16
44
45 ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
46 ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
47 define void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind {
48 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
49 %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid
50 %gep1 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %bptr, i32 %tid
51 %outgep = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid
52 %a = load <3 x i16>, <3 x i16> addrspace(1)* %gep0, align 4
53 %b = load <3 x i16>, <3 x i16> addrspace(1)* %gep1, align 4
54 %cmp = icmp sge <3 x i16> %a, %b
55 %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
56 store <3 x i16> %val, <3 x i16> addrspace(1)* %outgep, align 4
57 ret void
58 }
59
60 ; FIXME: Need to handle non-uniform case for function below (load without gep).
2261 ; GCN-LABEL: {{^}}v_test_imax_sge_v4i16:
2362 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
2463 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}