llvm.org GIT mirror llvm / 110a2bc
[AMDGPU] Untangle SDWA pass from SIShrinkInstructions Remove dependency of SDWA pass on SIShrinkInstructions. The goal is to move SDWA even higher in the stack to avoid second run of MachineLICM, MachineCSE and SIFoldOperands. Also added handling to preserve original src modifiers. Differential Revision: https://reviews.llvm.org/D33860 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304665 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin 2 years ago
24 changed file(s) with 173 addition(s) and 125 deletion(s). Raw diff Collapse all Expand all
733733 addPass(&SIFoldOperandsID);
734734 addPass(&DeadMachineInstructionElimID);
735735 addPass(&SILoadStoreOptimizerID);
736 addPass(createSIShrinkInstructionsPass());
737736 if (EnableSDWAPeephole) {
738737 addPass(&SIPeepholeSDWAID);
739738 addPass(&MachineLICMID);
741740 addPass(&SIFoldOperandsID);
742741 addPass(&DeadMachineInstructionElimID);
743742 }
743 addPass(createSIShrinkInstructionsPass());
744744 }
745745
746746 bool GCNPassConfig::addILPOpts() {
128128 bool getNeg() const { return Neg; }
129129 bool getSext() const { return Sext; }
130130
131 uint64_t getSrcMods() const;
131 uint64_t getSrcMods(const SIInstrInfo *TII,
132 const MachineOperand *SrcOp) const;
132133 };
133134
134135 class SDWADstOperand : public SDWAOperand {
239240 return SuperMask.all();
240241 }
241242
242 uint64_t SDWASrcOperand::getSrcMods() const {
243 uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
244 const MachineOperand *SrcOp) const {
243245 uint64_t Mods = 0;
246 const auto *MI = SrcOp->getParent();
247 if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
248 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
249 Mods = Mod->getImm();
250 }
251 } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
252 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
253 Mods = Mod->getImm();
254 }
255 }
244256 if (Abs || Neg) {
245257 assert(!Sext &&
246258 "Float and integer src modifiers can't be set simulteniously");
247259 Mods |= Abs ? SISrcMods::ABS : 0;
248 Mods |= Neg ? SISrcMods::NEG : 0;
260 Mods ^= Neg ? SISrcMods::NEG : 0;
249261 } else if (Sext) {
250262 Mods |= SISrcMods::SEXT;
251263 }
311323 }
312324 copyRegOperand(*Src, *getTargetOperand());
313325 SrcSel->setImm(getSrcSel());
314 SrcMods->setImm(getSrcMods());
326 SrcMods->setImm(getSrcMods(TII, Src));
315327 getTargetOperand()->setIsKill(false);
316328 return true;
317329 }
408420 switch (Opcode) {
409421 case AMDGPU::V_LSHRREV_B32_e32:
410422 case AMDGPU::V_ASHRREV_I32_e32:
411 case AMDGPU::V_LSHLREV_B32_e32: {
423 case AMDGPU::V_LSHLREV_B32_e32:
424 case AMDGPU::V_LSHRREV_B32_e64:
425 case AMDGPU::V_ASHRREV_I32_e64:
426 case AMDGPU::V_LSHLREV_B32_e64: {
412427 // from: v_lshrrev_b32_e32 v1, 16/24, v0
413428 // to SDWA src:v0 src_sel:WORD_1/BYTE_3
414429
431446 TRI->isPhysicalRegister(Dst->getReg()))
432447 break;
433448
434 if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
449 if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
450 Opcode == AMDGPU::V_LSHLREV_B32_e64) {
435451 auto SDWADst = make_unique(
436452 Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
437453 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
440456 } else {
441457 auto SDWASrc = make_unique(
442458 Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
443 Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
459 Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
460 Opcode != AMDGPU::V_LSHRREV_B32_e64);
444461 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
445462 SDWAOperands[&MI] = std::move(SDWASrc);
446463 ++NumSDWAPatternsFound;
450467
451468 case AMDGPU::V_LSHRREV_B16_e32:
452469 case AMDGPU::V_ASHRREV_I16_e32:
453 case AMDGPU::V_LSHLREV_B16_e32: {
470 case AMDGPU::V_LSHLREV_B16_e32:
471 case AMDGPU::V_LSHRREV_B16_e64:
472 case AMDGPU::V_ASHRREV_I16_e64:
473 case AMDGPU::V_LSHLREV_B16_e64: {
454474 // from: v_lshrrev_b16_e32 v1, 8, v0
455475 // to SDWA src:v0 src_sel:BYTE_1
456476
471491 TRI->isPhysicalRegister(Dst->getReg()))
472492 break;
473493
474 if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
494 if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
495 Opcode == AMDGPU::V_LSHLREV_B16_e64) {
475496 auto SDWADst =
476497 make_unique(Dst, Src1, BYTE_1, UNUSED_PAD);
477498 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
480501 } else {
481502 auto SDWASrc = make_unique(
482503 Src1, Dst, BYTE_1, false, false,
483 Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);
504 Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
505 Opcode != AMDGPU::V_LSHRREV_B16_e64);
484506 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
485507 SDWAOperands[&MI] = std::move(SDWASrc);
486508 ++NumSDWAPatternsFound;
548570 ++NumSDWAPatternsFound;
549571 break;
550572 }
551 case AMDGPU::V_AND_B32_e32: {
573 case AMDGPU::V_AND_B32_e32:
574 case AMDGPU::V_AND_B32_e64: {
552575 // e.g.:
553576 // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
554577 // to SDWA src:v0 src_sel:WORD_0/BYTE_0
555578
556579 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
580 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
581 auto ValSrc = Src1;
557582 auto Imm = foldToImm(*Src0);
558 if (!Imm)
559 break;
560
561 if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
562 break;
563
564 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
583
584 if (!Imm) {
585 Imm = foldToImm(*Src1);
586 ValSrc = Src0;
587 }
588
589 if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
590 break;
591
565592 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
566593
567594 if (TRI->isPhysicalRegister(Src1->getReg()) ||
569596 break;
570597
571598 auto SDWASrc = make_unique(
572 Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
599 ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
573600 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
574601 SDWAOperands[&MI] = std::move(SDWASrc);
575602 ++NumSDWAPatternsFound;
582609
583610 bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
584611 // Check if this instruction has opcode that supports SDWA
585 return AMDGPU::getSDWAOp(MI.getOpcode()) != -1;
612 unsigned Opc = MI.getOpcode();
613 if (AMDGPU::getSDWAOp(Opc) != -1)
614 return true;
615 int Opc32 = AMDGPU::getVOPe32(Opc);
616 if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1)
617 return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
618 !TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
619 return false;
586620 }
587621
588622 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
589623 const SDWAOperandsVector &SDWAOperands) {
590624 // Convert to sdwa
591625 int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
626 if (SDWAOpcode == -1)
627 SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode()));
592628 assert(SDWAOpcode != -1);
629
630 // Copy dst, if it is present in original then should also be present in SDWA
631 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
632 if (!Dst && !TII->isVOPC(MI))
633 return false;
593634
594635 const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
595636
597638 MachineInstrBuilder SDWAInst =
598639 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
599640
600 // Copy dst, if it is present in original then should also be present in SDWA
601 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
602641 if (Dst) {
603642 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
604643 SDWAInst.add(*Dst);
605 } else {
606 assert(TII->isVOPC(MI));
607644 }
608645
609646 // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
613650 Src0 &&
614651 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
615652 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
616 SDWAInst.addImm(0);
653 if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
654 SDWAInst.addImm(Mod->getImm());
655 else
656 SDWAInst.addImm(0);
617657 SDWAInst.add(*Src0);
618658
619659 // Copy src1 if present, initialize src1_modifiers.
622662 assert(
623663 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
624664 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
625 SDWAInst.addImm(0);
665 if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
666 SDWAInst.addImm(Mod->getImm());
667 else
668 SDWAInst.addImm(0);
626669 SDWAInst.add(*Src1);
627 } else {
628 assert(TII->isVOP1(MI));
629670 }
630671
631672 if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
6565
6666 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
6767 ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
68 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
68 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6969 define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
7070 %tid = call i32 @llvm.amdgcn.workitem.id.x()
7171 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
8383
8484 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
8585 ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
86 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
86 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8787 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
8888 %tid = call i32 @llvm.amdgcn.workitem.id.x()
8989 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
100100 ; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
101101 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
102102 ; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
103 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
103 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
104104 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
105105 ; VI: v_or_b32_e32
106106 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
139139
140140 ; VI-NOT: v_add_u16
141141 ; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
142 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v[[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
142 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
143143 ; VI-NOT: v_add_u16
144144 ; VI: v_or_b32_e32
145145 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
88 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
99
1010 ; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
11 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
11 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1212
1313 ; CI: v_ashrrev_i32_e32
1414 ; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
3939 ; VI: flat_load_ushort [[LO:v[0-9]+]]
4040 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
4141 ; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]]
42 ; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[MASK]], [[LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
42 ; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4343 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]]
4444 ; VI: flat_store_dword
4545
5959 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
6060
6161 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
62 ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
63 ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
62 ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
63 ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6464 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
6565 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
6666 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
127127 ; CI: v_cvt_f16_f32
128128
129129 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
130 ; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
130 ; VI: v_mul_f16_sdwa v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
131131 ; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
132132
133133 ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
7777 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
7878
7979 ; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
80 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
80 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8181 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
8282
8383 ; GCN: buffer_store_dword v[[R_V2_F16]]
107107 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
108108
109109 ; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000
110 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
110 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
111111 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
112112 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
113113
136136 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
137137
138138 ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
139 ; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[CONST1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
139 ; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
140140 ; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
141141 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
142142
277277 }
278278
279279 ; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
280 ; VI: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}}
281 ; VI-DAG: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
282 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
280 ; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00
281 ; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], [[ONE]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
282 ; VI: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
283283 ; VI-NOT: v_and_b32
284284
285285 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}
7777 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
7878
7979 ; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
80 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
80 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8181 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
8282
8383 ; GCN: buffer_store_dword v[[R_V2_F16]]
104104 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
105105 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
106106 ; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
107 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
107 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
108108 ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
109109 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
110110 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
130130 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
131131 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
132132 ; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
133 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST3]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
133 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
134134 ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
135135 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
136136 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
7272 ; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
7373 ; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
7474 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
75 ; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
75 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
7676 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
7777 ; CIVI: flat_store_dword
7878
9191 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
9292 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
9393 ; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
94 ; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
94 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9595 ; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
96 ; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
96 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
9797 ; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
9898
9999 ; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
115115 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
116116
117117 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
118 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
118 ; VI: v_mul_f16_sdwa v{{[0-9]+}}, -|v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
119119
120120 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
121121 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
116116 ; CI: v_cvt_f16_f32
117117
118118 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
119 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
119 ; VI: v_mul_f16_sdwa v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
120120 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
121121
122122 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}}
6565 ; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
6666 ; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
6767 ; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
68 ; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
68 ; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
6969
7070 ; GCN: buffer_store_dword v[[R_V2_I16]]
7171 ; GCN: s_endpgm
6565 ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
6666 ; VI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
6767 ; VI: v_cvt_i32_f32_sdwa v[[R_I16_0:[0-9]+]], v[[A_F32_0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
68 ; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
68 ; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
6969
7070 ; GCN: buffer_store_dword v[[R_V2_I16]]
7171 ; GCN: s_endpgm
7777 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
7878
7979 ; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
80 ; VI-DAG: v_subrev_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
80 ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8181 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
8282
8383 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
145145 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
146146
147147 ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
148 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONSTM1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
148 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
149149 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
150150 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
151151
123123 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
124124 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]]
125125 ; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
126 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST0]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
126 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
127127 ; VI: v_or_b32
128128 ; VI: buffer_store_dword
129129 define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
141141 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
142142 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]]
143143 ; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
144 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST05]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
144 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
145145 ; VI: v_or_b32
146146 ; VI: buffer_store_dword
147147 define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
159159 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
160160 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]]
161161 ; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
162 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM05]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
162 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
163163 ; VI: v_or_b32
164164 ; VI: buffer_store_dword
165165 define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
177177 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
178178 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]]
179179 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
180 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
180 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
181181 ; VI: v_or_b32
182182 ; VI: buffer_store_dword
183183 define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
195195 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
196196 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]]
197197 ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
198 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
198 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
199199 ; VI: v_or_b32
200200 ; VI: buffer_store_dword
201201 define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
213213 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
214214 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]]
215215 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
216 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
216 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
217217 ; VI: v_or_b32
218218 ; VI: buffer_store_dword
219219 define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
231231 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
232232 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]]
233233 ; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
234 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
234 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
235235 ; VI: v_or_b32
236236 ; VI: buffer_store_dword
237237 define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
249249 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
250250 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]]
251251 ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
252 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST4]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
252 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
253253 ; VI: v_or_b32
254254 ; VI: buffer_store_dword
255255 define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
267267 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
268268 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]]
269269 ; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
270 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM4]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
270 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
271271 ; VI: v_or_b32
272272 ; VI: buffer_store_dword
273273 define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
284284 ; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
285285 ; VI: buffer_load_dword
286286 ; VI-NOT: and
287 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST05]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
287 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
288288 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
289289 ; VI: v_or_b32
290290 ; VI: buffer_store_dword
305305 ; VI-DAG: buffer_load_dword
306306 ; VI-NOT: and
307307 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
308 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
308 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
309309 ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
310310 ; VI: buffer_store_dword
311311 define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
324324 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
325325 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]]
326326 ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1
327 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
327 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
328328 ; VI: v_or_b32
329329 ; VI: buffer_store_dword
330330 define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
342342 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
343343 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]]
344344 ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2
345 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
345 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
346346 ; VI: v_or_b32
347347 ; VI: buffer_store_dword
348348 define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
360360 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
361361 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]]
362362 ; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16
363 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST16]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
363 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
364364 ; VI: v_or_b32
365365 ; VI: buffer_store_dword
366366 define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
378378 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
379379 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL0]]
380380 ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xffff
381 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
381 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
382382 ; VI: v_or_b32
383383 ; VI: buffer_store_dword
384384 define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
396396 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
397397 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL0]]
398398 ; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xfffe
399 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
399 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
400400 ; VI: v_or_b32
401401 ; VI: buffer_store_dword
402402 define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
414414 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
415415 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL0]]
416416 ; VI-DAG: v_mov_b32_e32 [[CONSTM16:v[0-9]+]], 0xfff0
417 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM16]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
417 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
418418 ; VI: v_or_b32
419419 ; VI: buffer_store_dword
420420 define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
432432 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
433433 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]]
434434 ; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
435 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST63]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
435 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
436436 ; VI: v_or_b32
437437 ; VI: buffer_store_dword
438438 define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
450450 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
451451 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]]
452452 ; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
453 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST64]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
453 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
454454 ; VI: v_or_b32
455455 ; VI: buffer_store_dword
456456 define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
260260 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000
261261 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
262262 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
263 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
263 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
264264
265265 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
266266 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
284284 ; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
285285 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
286286 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
287 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
287 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
288288 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
289289 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
290290 define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
344344 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000
345345 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
346346 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
347 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
347 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
348348
349349 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
350350 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
368368 ; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
369369 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
370370 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
371 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
371 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
372372 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
373373 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
374374 define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
117117 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
118118
119119 ; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
120 ; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
120 ; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
121121 ; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
122122 ; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
123123 ; VI-FLUSH-NOT: v_and_b32
8181 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
8282
8383 ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
84 ; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
84 ; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8585 ; VI-NOT: and
8686 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
8787
109109 ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
110110 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
111111 ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
112 ; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
112 ; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
113113 ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
114114
115115 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
137137 ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
138138 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
139139 ; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
140 ; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST3]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
140 ; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
141141 ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
142142
143143 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
8080 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
8181
8282 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
83 ; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
83 ; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8484 ; VI-NOT: and
8585 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
8686
110110 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
111111
112112 ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
113 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
113 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
114114 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
115115
116116 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
138138 ; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
139139 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
140140 ; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
141 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST3]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
141 ; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
142142 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
143143
144144 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
1111 ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
1212 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
1313
14 ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
15 ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
14 ; GCN-DAG: v_mov_b32_e32 [[C200:v[0-9]+]], 0x200
15 ; GCN-DAG: v_mov_b32_e32 [[C400:v[0-9]+]], 0x400
16 ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[C200]], [[CLAMP_IDX]]
17 ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[C400]], [[CLAMP_IDX]]
1618
1719 ; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
1820 ; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
7373
7474 ; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
7575 ; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
76 ; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
76 ; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
7777
7878 define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
7979 entry:
9696 ; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
9797 ; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
9898 ; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
99 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
100 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
99 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
100 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
101101
102102 define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
103103 entry:
124124 ; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
125125 ; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
126126 ; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
127 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
128 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
129 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
130 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
127 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
128 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
129 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
130 ; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
131131
132132 define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
133133 entry:
346346 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
347347 ; SDWA-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
348348 ; SDWA-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
349 ; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v[[M123]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
350 ; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v[[M321]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
349 ; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
350 ; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
351351
352352 define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
353353 entry:
366366 ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
367367 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
368368
369 ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
369 ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
370370
371371 define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
372372 entry:
407407 ; NOSDWA-NOT: v_and_b32_sdwa
408408 ; NOSDWA-NOT: v_or_b32_sdwa
409409
410 ; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
410 ; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
411411 ; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
412 ; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
412 ; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
413413 ; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
414414 ; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
415415
99
1010 ; VI: v_lshlrev_b32_e32
1111 ; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
12 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
12 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1313
1414 ; CI: v_lshlrev_b32_e32
1515 ; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
99
1010 ; VI: v_sub_i32_e32
1111 ; VI-DAG: v_sub_i32_e32
12 ; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
13 ; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
12 ; VI: v_max_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
13 ; VI: v_max_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1414 ; VI: v_add_i32_e32
1515 ; VI: v_add_i32_e32
16 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
16 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1717
1818 ; CI: v_sub_i32_e32
1919 ; CI-DAG: v_sub_i32_e32
4646 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
4747 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
4848 ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
49 ; VI: v_add_u16_sdwa v{{[0-9]+}}, [[TWO]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
49 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5050 ; VI-NOT: v_and_b32
5151 ; VI: v_or_b32_e32
5252 define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
44 ; GCN-LABEL: {{^}}v_test_sub_v2i16:
55 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
66
7 ; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
7 ; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
88 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
99 define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
1010 %tid = call i32 @llvm.amdgcn.workitem.id.x()
6161 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
6262
6363 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
64 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
64 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
6565 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
6666 define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
6767 %tid = call i32 @llvm.amdgcn.workitem.id.x()
7979 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
8080
8181 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3df
82 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}}
82 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
8383 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
8484 define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
8585 %tid = call i32 @llvm.amdgcn.workitem.id.x()
9797 ; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
9898 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
9999 ; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
100 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[ONE]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
100 ; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
101101 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
102102 ; VI: v_or_b32_e32
103103 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
136136
137137 ; VI-NOT: v_subrev_i16
138138 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
139 ; VI: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
139 ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
140140 ; VI-NOT: v_subrev_i16
141141 ; VI: v_or_b32_e32
142142 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
251251 ; GFX9: v_pk_sub_i16
252252 ; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
253253
254 ; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
254 ; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
255255 ; VI: v_subrev_u16_e32
256256
257257 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
303303 ; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
304304
305305 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
306 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
306 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
307 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
308 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
309 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
307310 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
308 ; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
309 ; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
311 ; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
312 ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
310313 ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
311 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
312 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
313 ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
314314 ; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
315315 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
316316 ; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
319319 ; VI-NOT: and
320320 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
321321
322 ; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
323 ; VI-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
324 ; VI-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
325 ; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
322 ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
323 ; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
324 ; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
325 ; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
326326 ; VI-NOT: and
327 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
327 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[C_V2_F16]]
328328
329329 ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
330330 ; GCN: s_endpgm
335335 <2 x half> addrspace(1)* %c) #0 {
336336 entry:
337337 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
338 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
338 call void @llvm.amdgcn.s.barrier() #2
339 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
340 call void @llvm.amdgcn.s.barrier() #2
339341 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
340342
341343 %t.val = fmul <2 x half> %a.val, %b.val
484486 ; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
485487 ; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
486488 ; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
487 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
489 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
488490 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
489491
490492 ; GCN: s_endpgm
516518 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
517519 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
518520 ; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
519 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
521 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
520522 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
521523
522524 ; GCN: s_endpgm
669671 ret void
670672 }
671673
674 declare void @llvm.amdgcn.s.barrier() #2
675
672676 attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
673677 attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
678 attributes #2 = { nounwind convergent }