llvm.org GIT mirror llvm / 7f5f318
[AMDGPU] gfx1010: use fmac instructions Differential Revision: https://reviews.llvm.org/D61527 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359959 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin 1 year, 5 months ago
11 changed file(s) with 1107 addition(s) and 332 deletion(s). Raw diff Collapse all Expand all
520520
521521 // F16 - VOP3 Actions.
522522 setOperationAction(ISD::FMA, MVT::f16, Legal);
523 if (!Subtarget->hasFP16Denormals())
523 if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
524524 setOperationAction(ISD::FMAD, MVT::f16, Legal);
525525
526526 for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
87228722
87238723 // Only do this if we are not trying to support denormals. v_mad_f32 does not
87248724 // support denormals ever.
8725 if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8726 (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
8725 if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8726 (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
8727 getSubtarget()->hasMadF16())) &&
8728 isOperationLegal(ISD::FMAD, VT))
87278729 return ISD::FMAD;
87288730
87298731 const TargetOptions &Options = DAG.getTarget().Options;
20702070 }
20712071
20722072 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2073 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
2073 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 ||
2074 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2075 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) {
20742076 // Don't fold if we are using source or output modifiers. The new VOP2
20752077 // instructions don't have them.
20762078 if (hasAnyModifiersSet(UseMI))
20852087 if (isInlineConstant(UseMI, *Src0, *ImmOp))
20862088 return false;
20872089
2088 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
2090 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2091 Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64;
2092 bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2093 Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64;
20892094 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
20902095 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
20912096
20982103 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
20992104 return false;
21002105
2106 unsigned NewOpc =
2107 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
2108 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
2109 if (pseudoToMCOpcode(NewOpc) == -1)
2110 return false;
2111
21012112 // We need to swap operands 0 and 1 since madmk constant is at operand 1.
21022113
21032114 const int64_t Imm = ImmOp->getImm();
21182129 Src0->setIsKill(Src1->isKill());
21192130
21202131 if (Opc == AMDGPU::V_MAC_F32_e64 ||
2121 Opc == AMDGPU::V_MAC_F16_e64)
2132 Opc == AMDGPU::V_MAC_F16_e64 ||
2133 Opc == AMDGPU::V_FMAC_F32_e64 ||
2134 Opc == AMDGPU::V_FMAC_F16_e64)
21222135 UseMI.untieRegOperand(
21232136 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
21242137
21252138 Src1->ChangeToImmediate(Imm);
21262139
21272140 removeModOperands(UseMI);
2128 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
2141 UseMI.setDesc(get(NewOpc));
21292142
21302143 bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
21312144 if (DeleteDef)
21752188 // VGPR is okay as Src1 - fallthrough
21762189 }
21772190
2191 unsigned NewOpc =
2192 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
2193 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
2194 if (pseudoToMCOpcode(NewOpc) == -1)
2195 return false;
2196
21782197 const int64_t Imm = ImmOp->getImm();
21792198
21802199 // FIXME: This would be a lot easier if we could return a new instruction
21872206 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
21882207
21892208 if (Opc == AMDGPU::V_MAC_F32_e64 ||
2190 Opc == AMDGPU::V_MAC_F16_e64)
2209 Opc == AMDGPU::V_MAC_F16_e64 ||
2210 Opc == AMDGPU::V_FMAC_F32_e64 ||
2211 Opc == AMDGPU::V_FMAC_F16_e64)
21912212 UseMI.untieRegOperand(
21922213 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
21932214
21962217
21972218 // These come before src2.
21982219 removeModOperands(UseMI);
2199 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
2220 UseMI.setDesc(get(NewOpc));
22002221
22012222 bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
22022223 if (DeleteDef)
23092330 LiveVariables *LV) const {
23102331 unsigned Opc = MI.getOpcode();
23112332 bool IsF16 = false;
2312 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
2333 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2334 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
23132335
23142336 switch (Opc) {
23152337 default:
23162338 return nullptr;
23172339 case AMDGPU::V_MAC_F16_e64:
2340 case AMDGPU::V_FMAC_F16_e64:
23182341 IsF16 = true;
23192342 LLVM_FALLTHROUGH;
23202343 case AMDGPU::V_MAC_F32_e64:
23212344 case AMDGPU::V_FMAC_F32_e64:
23222345 break;
23232346 case AMDGPU::V_MAC_F16_e32:
2347 case AMDGPU::V_FMAC_F16_e32:
23242348 IsF16 = true;
23252349 LLVM_FALLTHROUGH;
23262350 case AMDGPU::V_MAC_F32_e32:
23492373 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
23502374 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
23512375
2352 if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
2376 if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
23532377 // If we have an SGPR input, we will violate the constant bus restriction.
23542378 (ST.getConstantBusLimit(Opc) > 1 ||
23552379 !Src0->isReg() ||
23562380 !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
23572381 if (auto Imm = getFoldableImm(Src2)) {
2358 return BuildMI(*MBB, MI, MI.getDebugLoc(),
2359 get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2360 .add(*Dst)
2361 .add(*Src0)
2362 .add(*Src1)
2363 .addImm(Imm);
2364 }
2382 unsigned NewOpc =
2383 IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
2384 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
2385 if (pseudoToMCOpcode(NewOpc) != -1)
2386 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2387 .add(*Dst)
2388 .add(*Src0)
2389 .add(*Src1)
2390 .addImm(Imm);
2391 }
2392 unsigned NewOpc =
2393 IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
2394 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
23652395 if (auto Imm = getFoldableImm(Src1)) {
2366 return BuildMI(*MBB, MI, MI.getDebugLoc(),
2367 get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2368 .add(*Dst)
2369 .add(*Src0)
2370 .addImm(Imm)
2371 .add(*Src2);
2396 if (pseudoToMCOpcode(NewOpc) != -1)
2397 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2398 .add(*Dst)
2399 .add(*Src0)
2400 .addImm(Imm)
2401 .add(*Src2);
23722402 }
23732403 if (auto Imm = getFoldableImm(Src0)) {
2374 if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2404 if (pseudoToMCOpcode(NewOpc) != -1 &&
2405 isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc,
23752406 AMDGPU::OpName::src0), Src1))
2376 return BuildMI(*MBB, MI, MI.getDebugLoc(),
2377 get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2407 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
23782408 .add(*Dst)
23792409 .add(*Src1)
23802410 .addImm(Imm)
23822412 }
23832413 }
23842414
2385 assert((!IsFMA || !IsF16) && "fmac only expected with f32");
2386 unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
2387 (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2415 unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32)
2416 : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2417 if (pseudoToMCOpcode(NewOpc) == -1)
2418 return nullptr;
2419
23882420 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
23892421 .add(*Dst)
23902422 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
26772709 case AMDGPU::V_MAC_F32_e64:
26782710 case AMDGPU::V_MAC_F16_e64:
26792711 case AMDGPU::V_FMAC_F32_e64:
2712 case AMDGPU::V_FMAC_F16_e64:
26802713 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
26812714 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
26822715 return false;
34093442 MachineBasicBlock *MBB = MI.getParent();
34103443 MachineOperand &MO = MI.getOperand(OpIdx);
34113444 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3445 const SIRegisterInfo *TRI =
3446 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
34123447 unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
34133448 const TargetRegisterClass *RC = RI.getRegClass(RCID);
3414 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
3449 unsigned Size = TRI->getRegSizeInBits(*RC);
3450 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
34153451 if (MO.isReg())
34163452 Opcode = AMDGPU::COPY;
34173453 else if (RI.isSGPRClass(RC))
3418 Opcode = AMDGPU::S_MOV_B32;
3454 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
34193455
34203456 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
34213457 if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
53315367 }
53325368
53335369 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
5370 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5371 return (16ULL << 44) | // IMG_FORMAT_32_FLOAT
5372 (1ULL << 56) | // RESOURCE_LEVEL = 1
5373 (3ULL << 60); // OOB_SELECT = 3
5374 }
5375
53345376 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
53355377 if (ST.isAmdHsaOS()) {
53365378 // Set ATC = 1. GFX9 doesn't have this bit.
53575399 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
53585400 }
53595401
5360 // IndexStride = 64.
5361 Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
5402 // IndexStride = 64 / 32.
5403 uint64_t IndexStride = ST.getGeneration() <= AMDGPUSubtarget::GFX9 ? 3 : 2;
5404 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
53625405
53635406 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
53645407 // Clear them unless we want a huge stride.
5365 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5408 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
5409 ST.getGeneration() <= AMDGPUSubtarget::GFX9)
53665410 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
53675411
53685412 return Rsrc23;
14611461
14621462 def : GCNPat<
14631463 (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
1464 (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
1464 (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
14651465 >;
14661466 }
14671467
15221522 >;
15231523 } // End OtherPredicates = [HasDLInsts]
15241524
1525 let SubtargetPredicate = isGFX10Plus in
1526 def : GCNPat <
1527 (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
1528 (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
1529 (f16 (VOP3NoMods f32:$src2))),
1530 (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
1531 SRCMODS.NONE, $src2, $clamp, $omod)
1532 >;
15251533
15261534 // Allow integer inputs
15271535 class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
417417 }
418418 assert(Src && Src->isReg());
419419
420 if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
420 if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
421 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
422 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
421423 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
422424 !isSameReg(*Src, *getReplacedOperand())) {
423425 // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
459461 bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
460462 // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
461463
462 if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
464 if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
465 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
466 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
463467 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
464468 getDstSel() != AMDGPU::SDWA::DWORD) {
465469 // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
963967 return false;
964968 }
965969
966 if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
970 if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
971 Opc == AMDGPU::V_FMAC_F32_e32 ||
972 Opc == AMDGPU::V_MAC_F16_e32 ||
967973 Opc == AMDGPU::V_MAC_F32_e32))
974 return false;
975
976 // Check if target supports this SDWA opcode
977 if (TII->pseudoToMCOpcode(Opc) == -1)
968978 return false;
969979
970980 // FIXME: has SDWA but require handling of implicit VCC use
10371047 SDWAInst.add(*Src1);
10381048 }
10391049
1040 if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
1050 if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
1051 SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
1052 SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
10411053 SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
10421054 // v_mac_f16/32 has additional src2 operand tied to vdst
10431055 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
45
56 ; Make sure fdiv is promoted to f32.
67
2021 ; SI: v_div_fixup_f32
2122 ; SI: v_cvt_f16_f32
2223
23 ; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
24 ; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
25
26 ; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
27 ; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
28
29 ; GFX8_9-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
30 ; GFX8_9: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
31 ; GFX8_9: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
32 ; GFX8_9: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
33 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
24 ; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
25 ; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
26
27 ; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
28 ; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
29
30 ; GFX8_9_10-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
31 ; GFX8_9_10: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
32 ; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
33 ; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
34 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
3435 define amdgpu_kernel void @v_fdiv_f16(
3536 half addrspace(1)* %r,
3637 half addrspace(1)* %a,
4950 }
5051
5152 ; GCN-LABEL: {{^}}v_rcp_f16:
52 ; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
53 ; GFX8_9-NOT: [[VAL]]
54 ; GFX8_9: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
55 ; GFX8_9-NOT: [[RESULT]]
56 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
53 ; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
54 ; GFX8_9_10-NOT: [[VAL]]
55 ; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
56 ; GFX8_9_10-NOT: [[RESULT]]
57 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
5758 define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
5859 entry:
5960 %tid = call i32 @llvm.amdgcn.workitem.id.x()
6768 }
6869
6970 ; GCN-LABEL: {{^}}v_rcp_f16_abs:
70 ; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
71 ; GFX8_9-NOT: [[VAL]]
72 ; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
73 ; GFX8_9-NOT: [RESULT]]
74 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
71 ; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
72 ; GFX8_9_10-NOT: [[VAL]]
73 ; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
74 ; GFX8_9_10-NOT: [RESULT]]
75 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
7576 define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
7677 entry:
7778 %tid = call i32 @llvm.amdgcn.workitem.id.x()
8687 }
8788
8889 ; GCN-LABEL: {{^}}v_rcp_f16_arcp:
89 ; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
90 ; GFX8_9-NOT: [[VAL]]
91 ; GFX8_9: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
92 ; GFX8_9-NOT: [[RESULT]]
93 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
90 ; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
91 ; GFX8_9_10-NOT: [[VAL]]
92 ; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
93 ; GFX8_9_10-NOT: [[RESULT]]
94 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
9495 define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
9596 entry:
9697 %tid = call i32 @llvm.amdgcn.workitem.id.x()
104105 }
105106
106107 ; GCN-LABEL: {{^}}v_rcp_f16_neg:
107 ; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
108 ; GFX8_9-NOT: [[VAL]]
109 ; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
110 ; GFX8_9-NOT: [RESULT]]
111 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
108 ; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
109 ; GFX8_9_10-NOT: [[VAL]]
110 ; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
111 ; GFX8_9_10-NOT: [RESULT]]
112 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
112113 define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
113114 entry:
114115 %tid = call i32 @llvm.amdgcn.workitem.id.x()
122123 }
123124
124125 ; GCN-LABEL: {{^}}v_rsq_f16:
125 ; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
126 ; GFX8_9-NOT: [[VAL]]
127 ; GFX8_9: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
128 ; GFX8_9-NOT: [RESULT]]
129 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
126 ; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
127 ; GFX8_9_10-NOT: [[VAL]]
128 ; GFX8_9_10: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
129 ; GFX8_9_10-NOT: [RESULT]]
130 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
130131 define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
131132 entry:
132133 %tid = call i32 @llvm.amdgcn.workitem.id.x()
141142 }
142143
143144 ; GCN-LABEL: {{^}}v_rsq_f16_neg:
144 ; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
145 ; GFX8_9-NOT: [[VAL]]
146 ; GFX8_9: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
147 ; GFX8_9-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
148 ; GFX8_9-NOT: [RESULT]]
149 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
145 ; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
146 ; GFX8_9_10-NOT: [[VAL]]
147 ; GFX8_9_10: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
148 ; GFX8_9_10-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
149 ; GFX8_9_10-NOT: [RESULT]]
150 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
150151 define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
151152 entry:
152153 %tid = call i32 @llvm.amdgcn.workitem.id.x()
161162 }
162163
163164 ; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
164 ; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
165 ; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
166
167 ; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
168 ; GFX8_9: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
169
170 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
165 ; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
166 ; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
167
168 ; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
169 ; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
170
171 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
171172 define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
172173 entry:
173174 %tid = call i32 @llvm.amdgcn.workitem.id.x()
183184 }
184185
185186 ; GCN-LABEL: {{^}}v_fdiv_f16_unsafe:
186 ; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
187 ; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
188
189 ; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
190 ; GFX8_9: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
191
192 ; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
187 ; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
188 ; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
189
190 ; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
191 ; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
192
193 ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
193194 define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
194195 entry:
195196 %tid = call i32 @llvm.amdgcn.workitem.id.x()
207208 ; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16:
208209 ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
209210
210 ; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
211 ; GFX8_9: buffer_store_short [[MUL]]
211 ; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
212 ; GFX8_9_10: buffer_store_short [[MUL]]
212213 define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
213214 %x = load half, half addrspace(1)* undef
214215 %rcp = fdiv arcp half %x, 2.0
219220 ; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
220221 ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
221222
222 ; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
223 ; GFX8_9: buffer_store_short [[MUL]]
223 ; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
224 ; GFX8_9_10: buffer_store_short [[MUL]]
224225 define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
225226 %x = load half, half addrspace(1)* undef
226227 %rcp = fdiv arcp half %x, 10.0
231232 ; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
232233 ; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}
233234
234 ; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
235 ; GFX8_9: buffer_store_short [[MUL]]
235 ; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
236 ; GFX8_9_10: buffer_store_short [[MUL]]
236237 define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
237238 %x = load half, half addrspace(1)* undef
238239 %rcp = fdiv arcp half %x, -10.0
0 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
1
2 ; GCN-LABEL: {{^}}addMul2D:
3 ; GFX1010: v_fmac_f16
4 ; GFX1010: v_fmac_f16
5 define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly, float addrspace(4)* nocapture readonly, <2 x i32>, i32) local_unnamed_addr #0 {
6 %5 = extractelement <2 x i32> %2, i64 1
7 %6 = icmp sgt i32 %5, 0
8 br i1 %6, label %7, label %38
9
10 7: ; preds = %4
11 %8 = extractelement <2 x i32> %2, i64 0
12 %9 = icmp sgt i32 %8, 0
13 br label %10
14
15 10: ; preds = %34, %7
16 %11 = phi <4 x half> [ zeroinitializer, %7 ], [ %35, %34 ]
17 %12 = phi i32 [ 0, %7 ], [ %36, %34 ]
18 br i1 %9, label %13, label %34
19
20 13: ; preds = %10
21 %14 = mul nsw i32 %12, %3
22 %15 = mul nsw i32 %12, %8
23 br label %16
24
25 16: ; preds = %16, %13
26 %17 = phi <4 x half> [ %11, %13 ], [ %31, %16 ]
27 %18 = phi i32 [ 0, %13 ], [ %32, %16 ]
28 %19 = add nsw i32 %18, %14
29 %20 = sext i32 %19 to i64
30 %21 = getelementptr inbounds <4 x i8>, <4 x i8>* %0, i64 %20
31 %22 = load <4 x i8>, <4 x i8>* %21, align 4
32 %23 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %22) #8
33 %24 = add nsw i32 %18, %15
34 %25 = sext i32 %24 to i64
35 %26 = getelementptr inbounds float, float addrspace(4)* %1, i64 %25
36 %27 = load float, float addrspace(4)* %26, align 4
37 %28 = fptrunc float %27 to half
38 %29 = insertelement <4 x half> undef, half %28, i32 0
39 %30 = shufflevector <4 x half> %29, <4 x half> undef, <4 x i32> zeroinitializer
40 %31 = tail call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %23, <4 x half> %30, <4 x half> %17)
41 %32 = add nuw nsw i32 %18, 1
42 %33 = icmp eq i32 %32, %8
43 br i1 %33, label %34, label %16
44
45 34: ; preds = %16, %10
46 %35 = phi <4 x half> [ %11, %10 ], [ %31, %16 ]
47 %36 = add nuw nsw i32 %12, 1
48 %37 = icmp eq i32 %36, %5
49 br i1 %37, label %38, label %10
50
51 38: ; preds = %34, %4
52 %39 = phi <4 x half> [ zeroinitializer, %4 ], [ %35, %34 ]
53 ret <4 x half> %39
54 }
55
56 define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8>) local_unnamed_addr #1 {
57 %2 = extractelement <4 x i8> %0, i64 0
58 %3 = uitofp i8 %2 to half
59 %4 = insertelement <4 x half> undef, half %3, i32 0
60 %5 = extractelement <4 x i8> %0, i64 1
61 %6 = uitofp i8 %5 to half
62 %7 = insertelement <4 x half> %4, half %6, i32 1
63 %8 = extractelement <4 x i8> %0, i64 2
64 %9 = uitofp i8 %8 to half
65 %10 = insertelement <4 x half> %7, half %9, i32 2
66 %11 = extractelement <4 x i8> %0, i64 3
67 %12 = uitofp i8 %11 to half
68 %13 = insertelement <4 x half> %10, half %12, i32 3
69 ret <4 x half> %13
70 }
71
72 declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>)
73
74 attributes #0 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+dl-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx9-insts,+s-memrealtime,-code-object-v3,-sram-ecc,-xnack" "unsafe-fp-math"="false" "use-soft-float"="false" }
75 attributes #1 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fp64-fp16-denormals,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
0 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
11 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
22
3 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
4 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
4 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
5
6 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GFX10-FLUSH,GFX10 %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GFX10-FLUSH,GFX10 %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
9 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s
510
611 declare i32 @llvm.amdgcn.workitem.id.x() #1
712 declare half @llvm.fmuladd.f16(half, half, half) #1
1116 ; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
1217
1318 ; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
19
20 ; GFX10-FLUSH: v_mul_f16_e32
21 ; GFX10-FLUSH: v_add_f16_e32
22 ; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
23
1424 define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
1525 half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
1626 %r0 = load half, half addrspace(1)* %in1
2232 }
2333
2434 ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
25 ; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
26 ; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
35 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
36 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
2737 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
2838 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
2939
30 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
40 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
41 ; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]
42
43 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
44 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
45
46 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
47 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
48 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
49
50 define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
51 %tid = call i32 @llvm.amdgcn.workitem.id.x()
52 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
53 %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
54 %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
55
56 %r1 = load volatile half, half addrspace(1)* %gep.0
57 %r2 = load volatile half, half addrspace(1)* %gep.1
58
59 %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
60 store half %r3, half addrspace(1)* %gep.out
61 ret void
62 }
63
64 ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
65 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
66 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
67 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
68 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
69
70 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
71 ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
72
73 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
74 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
75
3176 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
32 define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
33 %tid = call i32 @llvm.amdgcn.workitem.id.x()
34 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
35 %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
36 %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
37
38 %r1 = load volatile half, half addrspace(1)* %gep.0
39 %r2 = load volatile half, half addrspace(1)* %gep.1
40
41 %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
42 store half %r3, half addrspace(1)* %gep.out
43 ret void
44 }
45
46 ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
47 ; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
48 ; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
77 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
78 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
79
80 define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
81 %tid = call i32 @llvm.amdgcn.workitem.id.x()
82 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
83 %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
84 %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
85
86 %r1 = load volatile half, half addrspace(1)* %gep.0
87 %r2 = load volatile half, half addrspace(1)* %gep.1
88
89 %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
90 store half %r3, half addrspace(1)* %gep.out
91 ret void
92 }
93
94 ; GCN-LABEL: {{^}}fadd_a_a_b_f16:
95 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
96 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
4997 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
5098 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
5199
52 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
100 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
101 ; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
102
103 ; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
104 ; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
105
53106 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
54 define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
55 %tid = call i32 @llvm.amdgcn.workitem.id.x()
56 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
57 %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
58 %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
59
60 %r1 = load volatile half, half addrspace(1)* %gep.0
61 %r2 = load volatile half, half addrspace(1)* %gep.1
62
63 %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
64 store half %r3, half addrspace(1)* %gep.out
65 ret void
66 }
67
68 ; GCN-LABEL: {{^}}fadd_a_a_b_f16:
69 ; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
70 ; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
71 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
72 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
73
74 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
75
76 ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
77 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
78
79 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
107
108 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
109 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
110 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
111 ; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
112 ; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
113
80114 define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
81115 half addrspace(1)* %in1,
82116 half addrspace(1)* %in2) #0 {
95129 }
96130
97131 ; GCN-LABEL: {{^}}fadd_b_a_a_f16:
98 ; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
99 ; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
132 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
133 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
100134 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
101135 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
102136
103 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
104
105 ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
106 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
137 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
138 ; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
139
140 ; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
141 ; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
107142
108143 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
144
145 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
146 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
147 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
148 ; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
149 ; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
150
109151 define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
110152 half addrspace(1)* %in1,
111153 half addrspace(1)* %in2) #0 {
124166 }
125167
126168 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
127 ; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
128 ; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
169 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
170 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
171 ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
172 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
173 ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
174 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
175 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
176 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
177 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
178 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
179 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
180 define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
181 %tid = call i32 @llvm.amdgcn.workitem.id.x()
182 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
183 %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
184 %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
185
186 %r1 = load volatile half, half addrspace(1)* %gep.0
187 %r2 = load volatile half, half addrspace(1)* %gep.1
188
189 %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
190 store half %r3, half addrspace(1)* %gep.out
191 ret void
192 }
193
194 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
195 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
196 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
197 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
198 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
199
200 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
201 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
202
203 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
204 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
205 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
206
207 ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
208 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
209 define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
210 %tid = call i32 @llvm.amdgcn.workitem.id.x()
211 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
212 %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
213 %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
214
215 %r1 = load volatile half, half addrspace(1)* %gep.0
216 %r2 = load volatile half, half addrspace(1)* %gep.1
217
218 %r1.fneg = fsub half -0.000000e+00, %r1
219
220 %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
221 store half %r3, half addrspace(1)* %gep.out
222 ret void
223 }
224
225 ; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
226 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
227 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
129228 ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
130 ; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
131 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
132 define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
133 %tid = call i32 @llvm.amdgcn.workitem.id.x()
134 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
135 %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
136 %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
137
138 %r1 = load volatile half, half addrspace(1)* %gep.0
139 %r2 = load volatile half, half addrspace(1)* %gep.1
140
141 %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
142 store half %r3, half addrspace(1)* %gep.out
143 ret void
144 }
145
146 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
147 ; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
148 ; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
149 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
150 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
151
152 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
153 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
154 define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
229 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
230
231 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
232 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
233
234 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
235 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
236 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
237
238 ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
239 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
240 define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
155241 %tid = call i32 @llvm.amdgcn.workitem.id.x()
156242 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
157243 %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
162248
163249 %r1.fneg = fsub half -0.000000e+00, %r1
164250
165 %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
166 store half %r3, half addrspace(1)* %gep.out
167 ret void
168 }
169
170 ; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
171 ; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
172 ; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
173 ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
174 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
175
176 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
177 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
178 define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
179 %tid = call i32 @llvm.amdgcn.workitem.id.x()
180 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
181 %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
182 %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid
183
184 %r1 = load volatile half, half addrspace(1)* %gep.0
185 %r2 = load volatile half, half addrspace(1)* %gep.1
186
187 %r1.fneg = fsub half -0.000000e+00, %r1
188
189251 %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
190252 store half %r3, half addrspace(1)* %gep.out
191253 ret void
192254 }
193255
194256 ; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
195 ; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
196 ; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
197 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
198 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
199 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
257 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
258 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
259 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
260 ; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
261 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
262 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
263 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
264 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
200265 define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
201266 %tid = call i32 @llvm.amdgcn.workitem.id.x()
202267 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
214279 }
215280
216281 ; GCN-LABEL: {{^}}mad_sub_f16:
217 ; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
218 ; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
219 ; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
282 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
283 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
284 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
220285
221286 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
222287
223 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
224
225 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
226 ; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
227
228 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
288 ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
289
290 ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
291 ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
292
293 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
294
295 ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
296 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
297 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
229298 define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
230299 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
231300 %tid.ext = sext i32 %tid to i64
245314 }
246315
247316 ; GCN-LABEL: {{^}}mad_sub_inv_f16:
248 ; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
249 ; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
250 ; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
317 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
318 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
319 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
251320 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
252321
253 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
254
255 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
256 ; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
257
258 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
322 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
323 ; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]
324
325 ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
326 ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
327
328 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
329
330 ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
331 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
332 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
333 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
259334 define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
260335 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
261336 %tid.ext = sext i32 %tid to i64
275350 }
276351
277352 ; GCN-LABEL: {{^}}mad_sub_fabs_f16:
278 ; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
279 ; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
280 ; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
353 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
354 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
355 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
281356 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
282357
283 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
284
285 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
286 ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
287
288 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
358 ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
359
360 ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
361 ; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
362
363 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
364
365 ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
366 ; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
367 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
289368 define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
290369 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
291370 %tid.ext = sext i32 %tid to i64
306385 }
307386
308387 ; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
309 ; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
310 ; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
311 ; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
388 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
389 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
390 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
312391
313392 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
314393
315 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
316
317 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
318 ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
319
320 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
394 ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
395
396 ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
397 ; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
398
399 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
400
401 ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
402 ; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
403 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
321404 define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
322405 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
323406 %tid.ext = sext i32 %tid to i64
338421 }
339422
340423 ; GCN-LABEL: {{^}}neg_neg_mad_f16:
341 ; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
342 ; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
343 ; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
424 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
425 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
426 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
344427
345428 ; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
346429 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
347430
348 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
349
350 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
351 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
431 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
432 ; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
433
434 ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
435 ; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
352436 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
437
438 ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
439 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
440 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
441 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
353442 define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
354443 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
355444 %tid.ext = sext i32 %tid to i64
371460 }
372461
373462 ; GCN-LABEL: {{^}}mad_fabs_sub_f16:
374 ; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
375 ; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
376 ; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
463 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
464 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
465 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
377466
378467 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
379468
380 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
381
382 ; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
383 ; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
384
385 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
469 ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
470
471 ; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
472 ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
473
474 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
475
476 ; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
477 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
478 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
386479 define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
387480 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
388481 %tid.ext = sext i32 %tid to i64
403496 }
404497
405498 ; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
406 ; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
407 ; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
499 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
500 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
408501 ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
409502 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
410503
411 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
412
413 ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
414 ; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
504 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
505 ; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
506
507 ; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
508 ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
415509
416510 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
511
512 ; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
513 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
514 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
515 ; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
516 ; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
417517 define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
418518 %tid = call i32 @llvm.amdgcn.workitem.id.x()
419519 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
431531 }
432532
433533 ; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
434 ; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
435 ; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
534 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
535 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
436536
437537 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
438538
439 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
440
441 ; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
442 ; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
443
444 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
539 ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
540
541 ; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
542 ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
543
544 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
545
546 ; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
547 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
548 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
445549 define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
446550 %tid = call i32 @llvm.amdgcn.workitem.id.x()
447551 %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
11 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
22 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
33 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-FLUSH %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-DENORM %s
46
57 declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
68 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
2123
2224 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
2325 ; VI-DENORM: buffer_store_short [[RESULT]]
26
27 ; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], v[[A_F16]], v[[B_F16]]
28 ; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
29 ; GFX10-FLUSH: buffer_store_short [[ADD]]
30
31 ; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
32 ; GFX10-DENORM: buffer_store_short v[[C_F16]],
2433
2534 ; GCN: s_endpgm
2635 define amdgpu_kernel void @fmuladd_f16(
5261 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]]
5362 ; VI-DENORM: buffer_store_short [[RESULT]]
5463
64 ; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[B_F16]]
65 ; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
66 ; GFX10-FLUSH: buffer_store_short [[ADD]]
67
68 ; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]]
69 ; GFX10-DENORM: buffer_store_short v[[C_F16]],
70
5571 ; GCN: s_endpgm
5672 define amdgpu_kernel void @fmuladd_f16_imm_a(
5773 half addrspace(1)* %r,
8096 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]]
8197 ; VI-DENORM: buffer_store_short [[RESULT]]
8298
99 ; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[A_F16]]
100 ; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
101 ; GFX10-FLUSH: buffer_store_short [[ADD]]
102
103 ; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]]
104 ; GFX10-DENORM: buffer_store_short v[[C_F16]],
83105
84106 ; GCN: s_endpgm
85107 define amdgpu_kernel void @fmuladd_f16_imm_b(
106128 ; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]]
107129 ; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]]
108130
131 ; GFX10: buffer_load_dword v[[A_V2_F16:[0-9]+]]
132 ; GFX10: buffer_load_dword v[[B_V2_F16:[0-9]+]]
133 ; GFX10: buffer_load_dword v[[C_V2_F16:[0-9]+]]
109134
110135 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
111136 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
114139
115140 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
116141 ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
117
118142
119143 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
120144 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
125149 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
126150 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
127151 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
128
129152
130153 ; VI-FLUSH: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
131154 ; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
143166 ; VI-DENORM-NOT: v_and_b32
144167 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
145168
169 ; GFX10-FLUSH: v_pk_mul_f16 [[MUL:v[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
170 ; GFX10-FLUSH: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[MUL]], v[[C_V2_F16]]
171
172 ; GFX10-DENORM: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
173
146174 ; GCN: buffer_store_dword v[[R_V2_F16]]
147175 define amdgpu_kernel void @fmuladd_v2f16(
148176 <2 x half> addrspace(1)* %r,
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9 %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,MAD,GFX10-MAD %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s
35
46 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
57 declare float @llvm.fabs.f32(float) nounwind readnone
1113 ; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
1214 ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
1315 ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
14 ; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
16 ; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
17 ; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
18 ; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
19 ; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
1520 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
1621 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
1722 %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
3237 ; it.
3338
3439 ; GCN-LABEL: {{^}}madak_2_use_f32:
35 ; GFX8_9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
36 ; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
37 ; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
38 ; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
39 ; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
40 ; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
41 ; GFX8_9: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
42 ; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
43 ; GCN-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
44 ; GCN-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
45 ; GCN: s_endpgm
40 ; GFX8_9_10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
41 ; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
42 ; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
43 ; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
44 ; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
45 ; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
46 ; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
47 ; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
48 ; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
49 ; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
50 ; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
51 ; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
52 ; FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
53 ; GCN: s_endpgm
4654 define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
4755 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
4856
6977
7078 ; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
7179 ; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
72 ; GCN: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
80 ; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
81 ; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
7382 define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
7483 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
7584 %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
93102 ; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
94103 ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
95104 ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
96 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
105 ; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
106 ; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
107 ; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
108 ; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
97109 define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
98110 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
99111 %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
111123
112124 ; We can't use an SGPR when forming madak
113125 ; GCN-LABEL: {{^}}s_v_madak_f32:
114 ; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
115 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
116 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
117 ; GCN-NOT: v_madak_f32
118 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
126 ; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
127 ; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
128 ; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
129 ; GCN-NOT: v_madak_f32
130 ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
131 ; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
132 ; FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
119133 define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
120134 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
121135 %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
130144 }
131145
132146 ; GCN-LABEL: @v_s_madak_f32
133 ; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
134 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
135 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
136 ; GCN-NOT: v_madak_f32
137 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
147 ; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
148 ; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
149 ; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
150 ; GFX6_8_9-NOT: v_madak_f32
151 ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
152 ; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
153 ; FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
138154 define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
139155 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
140156 %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
150166
151167 ; GCN-LABEL: {{^}}s_s_madak_f32:
152168 ; GCN-NOT: v_madak_f32
153 ; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
169 ; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
170 ; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
171 ; FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
154172 define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
155173 %mul = fmul float %a, %b
156174 %madak = fadd float %mul, 10.0
159177 }
160178
161179 ; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
162 ; GFX6: buffer_load_dword [[VA:v[0-9]+]]
163 ; GFX6: buffer_load_dword [[VB:v[0-9]+]]
164 ; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
165 ; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
166 ; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
167 ; GCN: s_endpgm
180 ; GFX6: buffer_load_dword [[VA:v[0-9]+]]
181 ; GFX6: buffer_load_dword [[VB:v[0-9]+]]
182 ; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
183 ; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
184 ; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
185 ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
186 ; FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
187 ; GCN: s_endpgm
168188 define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
169189 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
170190 %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
183203 }
184204
185205 ; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
186 ; GFX6: buffer_load_dword [[VA:v[0-9]+]]
187 ; GFX6: buffer_load_dword [[VB:v[0-9]+]]
188 ; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
189 ; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
190 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
191 ; GCN: s_endpgm
206 ; GFX6: buffer_load_dword [[VA:v[0-9]+]]
207 ; GFX6: buffer_load_dword [[VB:v[0-9]+]]
208 ; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
209 ; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
210 ; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
211 ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
212 ; FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
213 ; GCN: s_endpgm
192214 define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
193215 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
194216 %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
206228 ret void
207229 }
208230
209 ; SIFoldOperands should not fold the SGPR copy into the instruction
231 ; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
210232 ; because the implicit immediate already uses the constant bus.
233 ; On GFX10+ we can use two scalar operands.
211234 ; GCN-LABEL: {{^}}madak_constant_bus_violation:
212 ; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
213 ; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
214 ; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
215 ; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
216 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
217 ; GFX6: buffer_store_dword [[MUL]]
218 ; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
235 ; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
236 ; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
237 ; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
238 ; MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
239 ; FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
240 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
241 ; GFX6: buffer_store_dword [[MUL]]
242 ; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
219243 define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
220244 bb:
221245 %tmp = icmp eq i32 %arg1, 0
0 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1010 -check-prefix=GCN %s
1
2 # GCN-LABEL: {{^}}name: vop1_instructions
3
4 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
5 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
6 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
7 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
8 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
9
10 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 6, 0, 5, implicit $exec
11 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
12 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
13 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
14 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
15
16 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
17 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit $exec
18 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
19 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 1, 5, 0, 5, implicit $exec
20
21 ---
22 name: vop1_instructions
23 tracksRegLiveness: true
24 registers:
25 - { id: 0, class: vreg_64 }
26 - { id: 1, class: vreg_64 }
27 - { id: 2, class: sreg_64 }
28 - { id: 3, class: vgpr_32 }
29 - { id: 4, class: sreg_32_xm0 }
30 - { id: 5, class: sreg_32_xm0 }
31 - { id: 6, class: sreg_32_xm0 }
32 - { id: 7, class: sreg_32_xm0 }
33 - { id: 8, class: sreg_32 }
34 - { id: 9, class: vgpr_32 }
35 - { id: 10, class: vgpr_32 }
36 - { id: 11, class: vgpr_32 }
37 - { id: 12, class: vgpr_32 }
38 - { id: 13, class: vgpr_32 }
39 - { id: 14, class: vgpr_32 }
40 - { id: 15, class: vgpr_32 }
41 - { id: 16, class: vgpr_32 }
42 - { id: 17, class: vgpr_32 }
43 - { id: 18, class: vgpr_32 }
44 - { id: 19, class: vgpr_32 }
45 - { id: 20, class: vgpr_32 }
46 - { id: 21, class: vgpr_32 }
47 - { id: 22, class: vgpr_32 }
48 - { id: 23, class: vgpr_32 }
49 - { id: 24, class: vgpr_32 }
50 - { id: 25, class: vgpr_32 }
51 - { id: 26, class: vgpr_32 }
52 - { id: 27, class: vgpr_32 }
53 - { id: 28, class: vgpr_32 }
54 - { id: 29, class: vgpr_32 }
55 - { id: 30, class: vgpr_32 }
56 - { id: 31, class: vgpr_32 }
57 - { id: 32, class: vgpr_32 }
58 - { id: 33, class: vgpr_32 }
59 - { id: 34, class: vgpr_32 }
60 - { id: 35, class: vgpr_32 }
61 - { id: 36, class: vgpr_32 }
62 - { id: 37, class: vgpr_32 }
63 - { id: 38, class: vgpr_32 }
64 - { id: 39, class: vgpr_32 }
65 - { id: 40, class: vgpr_32 }
66 - { id: 41, class: vgpr_32 }
67 - { id: 42, class: vgpr_32 }
68 - { id: 43, class: vgpr_32 }
69 - { id: 44, class: vgpr_32 }
70 - { id: 45, class: vgpr_32 }
71 - { id: 46, class: vgpr_32 }
72 - { id: 47, class: vgpr_32 }
73 - { id: 48, class: vgpr_32 }
74 - { id: 100, class: vgpr_32 }
75 body: |
76 bb.0:
77 liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
78
79 %2 = COPY $sgpr30_sgpr31
80 %1 = COPY $vgpr2_vgpr3
81 %0 = COPY $vgpr0_vgpr1
82 %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
83
84 %5 = S_MOV_B32 65535
85 %6 = S_MOV_B32 65535
86
87 %10 = V_LSHRREV_B32_e64 16, %3, implicit $exec
88 %11 = V_MOV_B32_e32 %10, implicit $exec
89 %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
90 %14 = V_FRACT_F32_e32 123, implicit $exec
91 %15 = V_LSHLREV_B32_e64 16, %14, implicit $exec
92 %16 = V_LSHRREV_B32_e64 16, %15, implicit $exec
93 %17 = V_SIN_F32_e32 %16, implicit $exec
94 %18 = V_LSHLREV_B32_e64 16, %17, implicit $exec
95 %19 = V_LSHRREV_B32_e64 16, %18, implicit $exec
96 %20 = V_CVT_U32_F32_e32 %19, implicit $exec
97 %21 = V_LSHLREV_B32_e64 16, %20, implicit $exec
98 %23 = V_CVT_F32_I32_e32 123, implicit $exec
99 %24 = V_LSHLREV_B32_e64 16, %23, implicit $exec
100
101 %25 = V_LSHRREV_B32_e64 16, %3, implicit $exec
102 %26 = V_MOV_B32_e64 %25, implicit $exec
103 %26 = V_LSHLREV_B32_e64 16, %26, implicit $exec
104 %27 = V_FRACT_F32_e64 0, %6, 0, 0, implicit $exec
105 %28 = V_LSHLREV_B32_e64 16, %27, implicit $exec
106 %29 = V_LSHRREV_B32_e64 16, %28, implicit $exec
107 %30 = V_SIN_F32_e64 0, %29, 0, 0, implicit $exec
108 %31 = V_LSHLREV_B32_e64 16, %30, implicit $exec
109 %32 = V_LSHRREV_B32_e64 16, %31, implicit $exec
110 %33 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit $exec
111 %34 = V_LSHLREV_B32_e64 16, %33, implicit $exec
112 %35 = V_CVT_F32_I32_e64 %6, 0, 0, implicit $exec
113 %36 = V_LSHLREV_B32_e64 16, %35, implicit $exec
114
115
116 %37 = V_LSHRREV_B32_e64 16, %36, implicit $exec
117 %38 = V_FRACT_F32_e64 1, %37, 0, 0, implicit $exec
118 %39 = V_LSHLREV_B32_e64 16, %38, implicit $exec
119 %40 = V_LSHRREV_B32_e64 16, %39, implicit $exec
120 %41 = V_SIN_F32_e64 0, %40, 1, 0, implicit $exec
121 %42 = V_LSHLREV_B32_e64 16, %41, implicit $exec
122 %43 = V_LSHRREV_B32_e64 16, %42, implicit $exec
123 %44 = V_CVT_U32_F32_e64 1, %43, 0, 0, implicit $exec
124 %45 = V_LSHLREV_B32_e64 16, %44, implicit $exec
125 %46 = V_LSHRREV_B32_e64 16, %45, implicit $exec
126 %47 = V_CVT_F32_I32_e64 %46, 0, 1, implicit $exec
127 %48 = V_LSHLREV_B32_e64 16, %47, implicit $exec
128
129
130 %100 = V_MOV_B32_e32 %48, implicit $exec
131
132 FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
133 $sgpr30_sgpr31 = COPY %2
134 S_SETPC_B64_return $sgpr30_sgpr31
135
136 ...
137 ---
138 # GCN-LABEL: {{^}}name: vop2_instructions
139
140 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
141 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
142 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $exec
143 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec
144 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec
145
146 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
147 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
148 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
149 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 0, 23, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec
150 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec
151
152 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
153 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
154 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 1, 23, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 0, implicit $exec
155 # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit $exec
156
157 name: vop2_instructions
158 tracksRegLiveness: true
159 registers:
160 - { id: 0, class: vreg_64 }
161 - { id: 1, class: vreg_64 }
162 - { id: 2, class: sreg_64 }
163 - { id: 3, class: vgpr_32 }
164 - { id: 4, class: sreg_32_xm0 }
165 - { id: 5, class: sreg_32_xm0 }
166 - { id: 6, class: sreg_32_xm0 }
167 - { id: 7, class: sreg_32_xm0 }
168 - { id: 8, class: sreg_32 }
169 - { id: 9, class: vgpr_32 }
170 - { id: 10, class: vgpr_32 }
171 - { id: 11, class: vgpr_32 }
172 - { id: 12, class: vgpr_32 }
173 - { id: 13, class: vgpr_32 }
174 - { id: 14, class: vgpr_32 }
175 - { id: 15, class: vgpr_32 }
176 - { id: 16, class: vgpr_32 }
177 - { id: 17, class: vgpr_32 }
178 - { id: 18, class: vgpr_32 }
179 - { id: 19, class: vgpr_32 }
180 - { id: 20, class: vgpr_32 }
181 - { id: 21, class: vgpr_32 }
182 - { id: 22, class: vgpr_32 }
183 - { id: 23, class: vgpr_32 }
184 - { id: 24, class: vgpr_32 }
185 - { id: 25, class: vgpr_32 }
186 - { id: 26, class: vgpr_32 }
187 - { id: 27, class: vgpr_32 }
188 - { id: 28, class: vgpr_32 }
189 - { id: 29, class: vgpr_32 }
190 - { id: 30, class: vgpr_32 }
191 - { id: 31, class: vgpr_32 }
192 - { id: 32, class: vgpr_32 }
193 - { id: 33, class: vgpr_32 }
194 - { id: 34, class: vgpr_32 }
195 - { id: 35, class: vgpr_32 }
196 - { id: 36, class: vgpr_32 }
197 - { id: 37, class: vgpr_32 }
198 - { id: 38, class: vgpr_32 }
199 - { id: 39, class: vgpr_32 }
200 - { id: 40, class: vgpr_32 }
201 - { id: 41, class: vgpr_32 }
202 - { id: 42, class: vgpr_32 }
203 - { id: 43, class: vgpr_32 }
204 - { id: 44, class: vgpr_32 }
205 - { id: 45, class: vgpr_32 }
206 - { id: 46, class: vgpr_32 }
207 - { id: 47, class: vgpr_32 }
208 - { id: 48, class: vgpr_32 }
209 - { id: 49, class: vgpr_32 }
210 - { id: 50, class: vgpr_32 }
211 - { id: 51, class: vgpr_32 }
212 - { id: 52, class: vgpr_32 }
213 - { id: 53, class: vgpr_32 }
214 - { id: 54, class: vgpr_32 }
215 - { id: 55, class: vgpr_32 }
216 - { id: 56, class: vgpr_32 }
217 - { id: 57, class: vgpr_32 }
218 - { id: 58, class: vgpr_32 }
219 - { id: 59, class: vgpr_32 }
220 - { id: 60, class: vgpr_32 }
221 - { id: 100, class: vgpr_32 }
222 body: |
223 bb.0:
224 liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
225
226 %2 = COPY $sgpr30_sgpr31
227 %1 = COPY $vgpr2_vgpr3
228 %0 = COPY $vgpr0_vgpr1
229 %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
230
231 %5 = S_MOV_B32 65535
232 %6 = S_MOV_B32 65535
233
234 %11 = V_LSHRREV_B32_e64 16, %3, implicit $exec
235 %12 = V_AND_B32_e32 %6, %11, implicit $exec
236 %13 = V_LSHLREV_B32_e64 16, %12, implicit $exec
237 %14 = V_LSHRREV_B32_e64 16, %13, implicit $exec
238 %15 = V_BFE_U32 %13, 8, 8, implicit $exec
239 %16 = V_ADD_F32_e32 %14, %15, implicit $exec
240 %17 = V_LSHLREV_B32_e64 16, %16, implicit $exec
241 %18 = V_LSHRREV_B32_e64 16, %17, implicit $exec
242 %19 = V_BFE_U32 %17, 8, 8, implicit $exec
243 %20 = V_SUB_F16_e32 %18, %19, implicit $exec
244 %21 = V_LSHLREV_B32_e64 16, %20, implicit $exec
245 %22 = V_BFE_U32 %20, 8, 8, implicit $exec
246 %23 = V_FMAC_F32_e32 %21, %22, %22, implicit $exec
247 %24 = V_LSHLREV_B32_e64 16, %23, implicit $exec
248 %25 = V_LSHRREV_B32_e64 16, %24, implicit $exec
249 %26 = V_BFE_U32 %24, 8, 8, implicit $exec
250 %27 = V_FMAC_F16_e32 %25, %26, %26, implicit $exec
251 %28 = V_LSHLREV_B32_e64 16, %27, implicit $exec
252
253 %29 = V_LSHRREV_B32_e64 16, %28, implicit $exec
254 %30 = V_AND_B32_e64 23, %29, implicit $exec
255 %31 = V_LSHLREV_B32_e64 16, %30, implicit $exec
256 %32 = V_LSHRREV_B32_e64 16, %31, implicit $exec
257 %33 = V_BFE_U32 %31, 8, 8, implicit $exec
258 %34 = V_ADD_F32_e64 0, %32, 0, %33, 0, 0, implicit $exec
259 %35 = V_LSHLREV_B32_e64 16, %34, implicit $exec
260 %37 = V_BFE_U32 %35, 8, 8, implicit $exec
261 %38 = V_SUB_F16_e64 0, 23, 0, %37, 0, 0, implicit $exec
262 %39 = V_LSHLREV_B32_e64 16, %38, implicit $exec
263 %40 = V_BFE_U32 %39, 8, 8, implicit $exec
264 %41 = V_FMAC_F32_e64 0, 23, 0, %40, 0, %40, 0, 0, implicit $exec
265 %42 = V_LSHLREV_B32_e64 16, %41, implicit $exec
266 %43 = V_LSHRREV_B32_e64 16, %42, implicit $exec
267 %44 = V_BFE_U32 %42, 8, 8, implicit $exec
268 %45 = V_FMAC_F16_e64 0, %43, 0, %44, 0, %44, 0, 0, implicit $exec
269 %46 = V_LSHLREV_B32_e64 16, %45, implicit $exec
270
271 %47 = V_LSHRREV_B32_e64 16, %46, implicit $exec
272 %48 = V_BFE_U32 %46, 8, 8, implicit $exec
273 %49 = V_ADD_F32_e64 0, %47, 1, %48, 0, 0, implicit $exec
274 %50 = V_LSHLREV_B32_e64 16, %49, implicit $exec
275 %51 = V_BFE_U32 %50, 8, 8, implicit $exec
276 %52 = V_SUB_F16_e64 1, 23, 1, %51, 0, 0, implicit $exec
277 %53 = V_LSHLREV_B32_e64 16, %52, implicit $exec
278 %54 = V_BFE_U32 %53, 8, 8, implicit $exec
279 %55 = V_FMAC_F32_e64 1, 23, 1, %54, 1, %54, 1, 0, implicit $exec
280 %56 = V_LSHLREV_B32_e64 16, %55, implicit $exec
281 %57 = V_LSHRREV_B32_e64 16, %56, implicit $exec
282 %58 = V_BFE_U32 %56, 8, 8, implicit $exec
283 %59 = V_FMAC_F16_e64 1, %57, 1, %58, 1, %58, 0, 2, implicit $exec
284 %60 = V_LSHLREV_B32_e64 16, %59, implicit $exec
285
286 %100 = V_MOV_B32_e32 %60, implicit $exec
287
288 FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
289 $sgpr30_sgpr31 = COPY %2
290 S_SETPC_B64_return $sgpr30_sgpr31
291
292 ...
0 # RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s
1
2 # GCN-LABEL: name: test_fmamk_reg_imm_f32
3 # GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec
4 ---
5 name: test_fmamk_reg_imm_f32
6 registers:
7 - { id: 0, class: vreg_64 }
8 - { id: 1, class: vgpr_32 }
9 - { id: 2, class: vgpr_32 }
10 - { id: 3, class: vgpr_32 }
11 body: |
12 bb.0:
13
14 %0 = IMPLICIT_DEF
15 %1 = COPY %0.sub1
16 %2 = V_MOV_B32_e32 1078523331, implicit $exec
17 %3 = V_FMAC_F32_e32 killed %0.sub0, %2, killed %1, implicit $exec
18
19 ...
20
21 # GCN-LABEL: name: test_fmamk_imm_reg_f32
22 # GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec
23 ---
24 name: test_fmamk_imm_reg_f32
25 registers:
26 - { id: 0, class: vreg_64 }
27 - { id: 1, class: vgpr_32 }
28 - { id: 2, class: vgpr_32 }
29 - { id: 3, class: vgpr_32 }
30 body: |
31 bb.0:
32
33 %0 = IMPLICIT_DEF
34 %1 = COPY %0.sub1
35 %2 = V_MOV_B32_e32 1078523331, implicit $exec
36 %3 = V_FMAC_F32_e32 %2, killed %0.sub0, killed %1, implicit $exec
37
38 ...
39
40 # GCN-LABEL: name: test_fmaak_f32
41 # GCN: V_FMAAK_F32 killed %0.sub0, %0.sub1, 1078523331, implicit $exec
42 ---
43 name: test_fmaak_f32
44 registers:
45 - { id: 0, class: vreg_64 }
46 - { id: 1, class: vgpr_32 }
47 - { id: 2, class: vgpr_32 }
48 body: |
49 bb.0:
50
51 %0 = IMPLICIT_DEF
52 %1 = V_MOV_B32_e32 1078523331, implicit $exec
53 %2 = V_FMAC_F32_e32 killed %0.sub0, %0.sub1, %1, implicit $exec
54
55 ...
56
57 # GCN-LABEL: name: test_fmamk_reg_imm_f16
58 # GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec
59 ---
60 name: test_fmamk_reg_imm_f16
61 registers:
62 - { id: 0, class: vreg_64 }
63 - { id: 1, class: vgpr_32 }
64 - { id: 2, class: vgpr_32 }
65 - { id: 3, class: vgpr_32 }
66 body: |
67 bb.0:
68
69 %0 = IMPLICIT_DEF
70 %1 = COPY %0.sub1
71 %2 = V_MOV_B32_e32 1078523331, implicit $exec
72 %3 = V_FMAC_F16_e32 killed %0.sub0, %2, killed %1, implicit $exec
73
74 ...
75
76 # GCN-LABEL: name: test_fmamk_imm_reg_f16
77 # GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec
78 ---
79 name: test_fmamk_imm_reg_f16
80 registers:
81 - { id: 0, class: vreg_64 }
82 - { id: 1, class: vgpr_32 }
83 - { id: 2, class: vgpr_32 }
84 - { id: 3, class: vgpr_32 }
85 body: |
86 bb.0:
87
88 %0 = IMPLICIT_DEF
89 %1 = COPY %0.sub1
90 %2 = V_MOV_B32_e32 1078523331, implicit $exec
91 %3 = V_FMAC_F16_e32 %2, killed %0.sub0, killed %1, implicit $exec
92
93 ...
94
95 # GCN-LABEL: name: test_fmaak_f16
96 # GCN: V_FMAAK_F16 killed %0.sub0, %0.sub1, 1078523331, implicit $exec
97 ---
98 name: test_fmaak_f16
99 registers:
100 - { id: 0, class: vreg_64 }
101 - { id: 1, class: vgpr_32 }
102 - { id: 2, class: vgpr_32 }
103 body: |
104 bb.0:
105
106 %0 = IMPLICIT_DEF
107 %1 = V_MOV_B32_e32 1078523331, implicit $exec
108 %2 = V_FMAC_F16_e32 killed %0.sub0, %0.sub1, %1, implicit $exec
109 ...
110
111 # GCN-LABEL: name: test_fmaak_sgpr_src0_f32
112 # GCN: %2:vgpr_32 = V_FMAMK_F32 killed %0, 1078523331, %3:vgpr_32, implicit $exec
113
114 ---
115 name: test_fmaak_sgpr_src0_f32
116 registers:
117 - { id: 0, class: sreg_32_xm0 }
118 - { id: 1, class: vgpr_32}
119 - { id: 2, class: vgpr_32 }
120 - { id: 3, class: vgpr_32 }
121 body: |
122 bb.0:
123
124 %0 = IMPLICIT_DEF
125 %1 = V_MOV_B32_e32 1078523331, implicit $exec
126 %2 = V_FMAC_F32_e32 killed %0, %1, %3, implicit $exec
127
128 ...
129
130 # GCN-LABEL: name: test_fmaak_inlineimm_src0_f32
131 # GCN: %1:vgpr_32 = V_FMAMK_F32 1073741824, 1078523331, %2:vgpr_32, implicit $exec
132
133 ---
134 name: test_fmaak_inlineimm_src0_f32
135 registers:
136 - { id: 0, class: vgpr_32}
137 - { id: 1, class: vgpr_32 }
138 - { id: 2, class: vgpr_32 }
139 body: |
140 bb.0:
141
142 %0 = V_MOV_B32_e32 1078523331, implicit $exec
143 %1 = V_FMAC_F32_e32 1073741824, %0, %2, implicit $exec
144
145 ...
146
147 # GCN-LABEL: name: test_fmaak_otherimm_src0_f32
148 # GCN: %1:vgpr_32 = V_FMAC_F32_e32 1120403456, %0, %1, implicit $exec
149
150 ---
151 name: test_fmaak_otherimm_src0_f32
152 registers:
153 - { id: 0, class: vgpr_32}
154 - { id: 1, class: vgpr_32 }
155 - { id: 2, class: vgpr_32 }
156 body: |
157 bb.0:
158
159 %0 = V_MOV_B32_e32 1078523331, implicit $exec
160 %1 = V_FMAC_F32_e32 1120403456, %0, %2, implicit $exec
161
162 ...
163
164 # GCN-LABEL: name: test_fmaak_other_constantlike_src0_f32
165 # GCN: %1:vgpr_32 = V_FMAC_F32_e32 %stack.0, %0, %1, implicit $exec
166 ---
167 name: test_fmaak_other_constantlike_src0_f32
168 registers:
169 - { id: 0, class: vgpr_32}
170 - { id: 1, class: vgpr_32 }
171 - { id: 2, class: vgpr_32 }
172 stack:
173 - { id: 0, name: "", type: default, offset: 0, size: 128, alignment: 8,
174 callee-saved-register: '', local-offset: 0, debug-info-variable: '',
175 debug-info-expression: '', debug-info-location: '' }
176 body: |
177 bb.0:
178
179 %0 = V_MOV_B32_e32 1078523331, implicit $exec
180 %1 = V_FMAC_F32_e32 %stack.0, %0, %2, implicit $exec
181
182 ...