llvm.org GIT mirror llvm / 710e9b3
AMDGPU: Replace i64 add/sub lowering

Use VOP3 add/addc like usual. This has some tradeoffs: inline immediates fold a little better, but other constants are worse off. SIShrinkInstructions could be made smarter to handle these cases.

This allows us to avoid selecting scalar adds where we need to track the carry in scc and replace its users, which makes it easier to use the carryless VALU adds.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318340 91177308-0d34-0410-b5e6-96231b3b80d8
Matt Arsenault

17 changed files with 210 additions and 68 deletions.
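For context, here is a minimal C++ sketch (not the backend code, and not part of this commit) of the lo/hi split that both the S_ADD_U64_PSEUDO custom inserter and splitScalar64BitAddSub below perform: the low 32-bit halves are added with a carry-out, and the high halves consume that carry (V_ADD_I32_e64 / V_ADDC_U32_e64 on the VALU path, S_ADD_U32 / S_ADDC_U32 on the SALU path). The function name add64_split is hypothetical.

// Illustrative sketch only, not LLVM source.
#include <cstdint>

uint64_t add64_split(uint64_t a, uint64_t b) {
  uint32_t alo = uint32_t(a), ahi = uint32_t(a >> 32);
  uint32_t blo = uint32_t(b), bhi = uint32_t(b >> 32);
  uint32_t lo = alo + blo;                // low add; carry-out goes to vcc (V_ADD_I32) or scc (S_ADD_U32)
  uint32_t carry = (lo < alo) ? 1u : 0u;  // carry-out of the low half
  uint32_t hi = ahi + bhi + carry;        // high add consumes the carry (V_ADDC_U32 / S_ADDC_U32)
  return (uint64_t(hi) << 32) | lo;       // REG_SEQUENCE of the two halves
}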
458458 // We are selecting i64 ADD here instead of custom lower it during
459459 // DAG legalization, so we can fold some i64 ADDs used for address
460460 // calculation into the LOAD and STORE instructions.
461 case ISD::ADD:
462461 case ISD::ADDC:
463462 case ISD::ADDE:
464 case ISD::SUB:
465463 case ISD::SUBC:
466464 case ISD::SUBE: {
467465 if (N->getValueType(0) != MVT::i64)
26012601 MachineBasicBlock::iterator I(&MI);
26022602
26032603 unsigned DstReg = MI.getOperand(0).getReg();
2604 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2605 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2604 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2605 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
26062606
26072607 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
26082608
29532953 }
29542954
29552955 switch (MI.getOpcode()) {
2956 case AMDGPU::SI_INIT_M0:
2956 case AMDGPU::S_ADD_U64_PSEUDO:
2957 case AMDGPU::S_SUB_U64_PSEUDO: {
2958 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
2959 const DebugLoc &DL = MI.getDebugLoc();
2960
2961 MachineOperand &Dest = MI.getOperand(0);
2962 MachineOperand &Src0 = MI.getOperand(1);
2963 MachineOperand &Src1 = MI.getOperand(2);
2964
2965 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2966 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2967
2968 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
2969 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
2970 &AMDGPU::SReg_32_XM0RegClass);
2971 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
2972 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
2973 &AMDGPU::SReg_32_XM0RegClass);
2974
2975 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
2976 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
2977 &AMDGPU::SReg_32_XM0RegClass);
2978 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
2979 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
2980 &AMDGPU::SReg_32_XM0RegClass);
2981
2982 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
2983
2984 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
2985 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
2986 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
2987 .add(Src0Sub0)
2988 .add(Src1Sub0);
2989 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
2990 .add(Src0Sub1)
2991 .add(Src1Sub1);
2992 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
2993 .addReg(DestSub0)
2994 .addImm(AMDGPU::sub0)
2995 .addReg(DestSub1)
2996 .addImm(AMDGPU::sub1);
2997 MI.eraseFromParent();
2998 return BB;
2999 }
3000 case AMDGPU::SI_INIT_M0: {
29573001 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
29583002 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
29593003 .add(MI.getOperand(0));
29603004 MI.eraseFromParent();
29613005 return BB;
2962
3006 }
29633007 case AMDGPU::SI_INIT_EXEC:
29643008 // This should be before all vector instructions.
29653009 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
36073607 switch (Opcode) {
36083608 default:
36093609 break;
3610 case AMDGPU::S_ADD_U64_PSEUDO:
3611 case AMDGPU::S_SUB_U64_PSEUDO:
3612 splitScalar64BitAddSub(Worklist, Inst);
3613 Inst.eraseFromParent();
3614 continue;
36103615 case AMDGPU::S_AND_B64:
36113616 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
36123617 Inst.eraseFromParent();
39553960 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
39563961 }
39573962
3963 void SIInstrInfo::splitScalar64BitAddSub(
3964 SetVectorType &Worklist, MachineInstr &Inst) const {
3965 bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3966
3967 MachineBasicBlock &MBB = *Inst.getParent();
3968 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3969
3970 unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3971 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3972 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3973
3974 unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3975 unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3976
3977 MachineOperand &Dest = Inst.getOperand(0);
3978 MachineOperand &Src0 = Inst.getOperand(1);
3979 MachineOperand &Src1 = Inst.getOperand(2);
3980 const DebugLoc &DL = Inst.getDebugLoc();
3981 MachineBasicBlock::iterator MII = Inst;
3982
3983 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
3984 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
3985 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3986 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
3987
3988 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3989 AMDGPU::sub0, Src0SubRC);
3990 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3991 AMDGPU::sub0, Src1SubRC);
3992
3993
3994 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3995 AMDGPU::sub1, Src0SubRC);
3996 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3997 AMDGPU::sub1, Src1SubRC);
3998
3999 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
4000 MachineInstr *LoHalf =
4001 BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
4002 .addReg(CarryReg, RegState::Define)
4003 .add(SrcReg0Sub0)
4004 .add(SrcReg1Sub0);
4005
4006 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4007 MachineInstr *HiHalf =
4008 BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
4009 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4010 .add(SrcReg0Sub1)
4011 .add(SrcReg1Sub1)
4012 .addReg(CarryReg, RegState::Kill);
4013
4014 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4015 .addReg(DestSub0)
4016 .addImm(AMDGPU::sub0)
4017 .addReg(DestSub1)
4018 .addImm(AMDGPU::sub1);
4019
4020 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4021
4022 // Try to legalize the operands in case we need to swap the order to keep it
4023 // valid.
4024 legalizeOperands(*LoHalf);
4025 legalizeOperands(*HiHalf);
4026
4027 // Move all users of this moved value.
4028 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4029 }
4030
39584031 void SIInstrInfo::splitScalar64BitBinaryOp(
39594032 SetVectorType &Worklist, MachineInstr &Inst,
39604033 unsigned Opcode) const {
5959 static unsigned getBranchOpcode(BranchPredicate Cond);
6060 static BranchPredicate getBranchPredicate(unsigned Opcode);
6161
62 public:
6263 unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
6364 MachineRegisterInfo &MRI,
6465 MachineOperand &SuperReg,
7172 const TargetRegisterClass *SuperRC,
7273 unsigned SubIdx,
7374 const TargetRegisterClass *SubRC) const;
74
75 private:
7576 void swapOperands(MachineInstr &Inst) const;
7677
7778 void lowerScalarAbs(SetVectorType &Worklist,
8283
8384 void splitScalar64BitUnaryOp(SetVectorType &Worklist,
8485 MachineInstr &Inst, unsigned Opcode) const;
86
87 void splitScalar64BitAddSub(SetVectorType &Worklist,
88 MachineInstr &Inst) const;
8589
8690 void splitScalar64BitBinaryOp(SetVectorType &Worklist,
8791 MachineInstr &Inst, unsigned Opcode) const;
107107 let usesCustomInserter = 1;
108108 }
109109
110 // 64-bit vector move instruction. This is mainly used by the SIFoldOperands
111 // pass to enable folding of inline immediates.
110 // 64-bit vector move instruction. This is mainly used by the
111 // SIFoldOperands pass to enable folding of inline immediates.
112112 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
113113 (ins VSrc_b64:$src0)>;
114114
145145 [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
146146 let Constraints = "$src = $vdst";
147147 }
148
149
150 let usesCustomInserter = 1, Defs = [SCC] in {
151 def S_ADD_U64_PSEUDO : SPseudoInstSI <
152 (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
153 [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
154 >;
155
156 def S_SUB_U64_PSEUDO : SPseudoInstSI <
157 (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
158 [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
159 >;
160
161 def S_ADDC_U64_PSEUDO : SPseudoInstSI <(outs SReg_64:$vdst, SReg_64:$sdst),
162 (ins SSrc_b64:$src0, SSrc_b64:$src1)>;
163 def S_SUBC_U64_PSEUDO : SPseudoInstSI <(outs SReg_64:$vdst, SReg_64:$sdst),
164 (ins SSrc_b64:$src0, SSrc_b64:$src1)>;
165 } // End usesCustomInserter = 1, Defs = [SCC]
148166
149167 let usesCustomInserter = 1, SALU = 1 in {
150168 def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
162162 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
163163 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
164164
165 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
165166 ; VI: flat_load_ushort v[[A_HI:[0-9]+]]
166 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
167 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
167168 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
168 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
169169
170170 ; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
171171 ; VI-NOT: and
397397
398398 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
399399 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
400 ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
400 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
401 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
401402 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
402403 %tid = call i32 @llvm.amdgcn.workitem.id.x()
403404 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
4343 ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
4444 ; SI: buffer_load_dword [[VAL0:v[0-9]+]],
4545 ; SI: buffer_load_dword [[VAL1:v[0-9]+]],
46 ; VI: flat_load_dword [[VAL0:v[0-9]+]],
4647 ; VI: flat_load_dword [[VAL1:v[0-9]+]],
47 ; VI: flat_load_dword [[VAL0:v[0-9]+]],
4848 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
4949 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
5050 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
5757 %tid = call i32 @llvm.r600.read.tidig.x()
5858 %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
5959 %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
60 %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
61 %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
60 %val0 = load volatile i32, i32 addrspace(1)* %in0.gep, align 4
61 %val1 = load volatile i32, i32 addrspace(1)* %in1.gep, align 4
6262 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
6363 %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
6464 %add = add i32 %ctpop0, %ctpop1
260260 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
261261 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000
262262 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
263 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
264 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
265263
266264 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
267265 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
268266 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
267
268 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
269 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[AND]]
270 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
269271
270272 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
271273 define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
344346 ; GCN-LABEL: {{^}}v_insertelement_v2f16_1:
345347 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000
346348 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
347 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
348 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
349349
350350 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
351351 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
352352 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
353
354 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
355 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[AND]]
356
357 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
353358
354359 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
355360 define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
422427
423428 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
424429 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
430 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
425431 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
426432 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
427 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
428
429 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
430433
431434 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
432435 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
451454
452455 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
453456 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
457 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
458
454459 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
455460 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
456 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
457
458 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
459461
460462 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
461463 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
77 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
88 ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
99
10 ; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
10 ; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1111 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
1212 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
1313 ; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
6868 ; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i128:
6969 ; CI: v_mad_u64_u32
7070 ; CI: v_mad_u64_u32
71 ; CI: v_mad_i64_i32
7172 ; CI: v_mad_u64_u32
72 ; CI: v_mad_i64_i32
73
7374
7475 ; SI-NOT: v_mad_
7576 define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
210210 ; SI: s_mul_i32
211211 ; SI: v_mul_hi_u32
212212 ; SI: s_mul_i32
213 ; SI: s_mul_i32
213214
214215 ; SI-DAG: s_mul_i32
215216 ; SI-DAG: v_mul_hi_u32
218219 ; SI-DAG: s_mul_i32
219220 ; SI-DAG: v_mul_hi_u32
220221
221 ; SI: s_mul_i32
222 ; SI: s_mul_i32
223 ; SI: s_mul_i32
224 ; SI: s_mul_i32
225 ; SI: s_mul_i32
226
227
222 ; VI: s_mul_i32
223 ; VI: v_mad_u64_u32
228224 ; VI: s_mul_i32
229225 ; VI: v_mul_hi_u32
230226 ; VI: v_mad_u64_u32
231 ; VI: s_mul_i32
232227 ; VI: v_mul_hi_u32
233228 ; VI: v_mad_u64_u32
234 ; VI: v_mad_u64_u32
229
235230
236231
237232 ; GCN: buffer_store_dwordx4
245240 ; GCN: {{buffer|flat}}_load_dwordx4
246241 ; GCN: {{buffer|flat}}_load_dwordx4
247242
248 ; GCN-DAG: v_mul_lo_i32
249 ; GCN-DAG: v_mul_hi_u32
250 ; GCN-DAG: v_mul_hi_u32
251 ; GCN-DAG: v_mul_lo_i32
252 ; GCN-DAG: v_mul_hi_u32
253 ; GCN-DAG: v_mul_hi_u32
254 ; GCN-DAG: v_mul_lo_i32
255 ; GCN-DAG: v_mul_lo_i32
256 ; GCN-DAG: v_add_i32_e32
257
258 ; SI-DAG: v_mul_hi_u32
259 ; SI-DAG: v_mul_lo_i32
260 ; SI-DAG: v_mul_hi_u32
261 ; SI-DAG: v_mul_lo_i32
262 ; SI-DAG: v_mul_lo_i32
263 ; SI-DAG: v_mul_lo_i32
264 ; SI-DAG: v_mul_lo_i32
265 ; SI-DAG: v_mul_lo_i32
266
267 ; VI-DAG: v_mad_u64_u32
243 ; SI-DAG: v_mul_lo_i32
244 ; SI-DAG: v_mul_hi_u32
245 ; SI-DAG: v_mul_hi_u32
246 ; SI-DAG: v_mul_lo_i32
247 ; SI-DAG: v_mul_hi_u32
248 ; SI-DAG: v_mul_hi_u32
249 ; SI-DAG: v_mul_lo_i32
250 ; SI-DAG: v_mul_lo_i32
251 ; SI-DAG: v_add_i32_e32
252
253 ; SI-DAG: v_mul_hi_u32
254 ; SI-DAG: v_mul_lo_i32
255 ; SI-DAG: v_mul_hi_u32
256 ; SI-DAG: v_mul_lo_i32
257 ; SI-DAG: v_mul_lo_i32
258 ; SI-DAG: v_mul_lo_i32
259 ; SI-DAG: v_mul_lo_i32
260 ; SI-DAG: v_mul_lo_i32
261
262 ; VI-DAG: v_mul_lo_i32
263 ; VI-DAG: v_mul_hi_u32
264 ; VI: v_mad_u64_u32
268265 ; VI: v_mad_u64_u32
269266 ; VI: v_mad_u64_u32
270267
66 ; SI: NumVgprs: {{[1-9]$}}
77
88 ; stores may alias loads
9 ; VI: NumSgprs: {{[1-5][0-9]$}}
9 ; VI: NumSgprs: {{[0-9]$}}
1010 ; VI: NumVgprs: {{[1-3][0-9]$}}
1111
1212 define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
66 ; set in vcc, which is undefined since the low scalar half add sets
77 ; scc instead.
88
9 ; FIXME: SIShrinkInstructions should force immediate fold.
10
911 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
10 ; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}}
12 ; SI: s_movk_i32 [[K:s[0-9]+]], 0x18f
13 ; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}
1114 ; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
1215 define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
1316 %v.val = load volatile i32, i32 addrspace(1)* %in
8484
8585 ; FIXME: Need to handle non-uniform case for function below (load without gep).
8686 ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
87 ; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
8887 ; VI: flat_load_ushort [[A:v[0-9]+]]
8988 ; VI: flat_load_ushort [[B:v[0-9]+]]
89 ; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
9090 ; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
9191 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
9292 define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
5656 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
5757 define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
5858 %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
59 %a = load i16, i16 addrspace(1)* %in
59 %a = load i16, i16 addrspace(1)* %in
6060 %b = load i16, i16 addrspace(1)* %b_ptr
6161 %result = sub i16 %a, %b
6262 store i16 %result, i16 addrspace(1)* %out
7070
7171 define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
7272 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
73 %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
73 %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
7474 %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
7575 %result = sub <2 x i16> %a, %b
7676 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
8686
8787 define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
8888 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
89 %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
89 %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
9090 %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
9191 %result = sub <4 x i16> %a, %b
9292 store <4 x i16> %result, <4 x i16> addrspace(1)* %out
145145 }
146146
147147 ; FUNC-LABEL: {{^}}v_test_sub_v4i64:
148 ; SI: v_subrev_i32_e32
148 ; SI: v_sub_i32_e32
149149 ; SI: v_subb_u32_e32
150 ; SI: v_subrev_i32_e32
150 ; SI: v_sub_i32_e32
151151 ; SI: v_subb_u32_e32
152 ; SI: v_subrev_i32_e32
152 ; SI: v_sub_i32_e32
153153 ; SI: v_subb_u32_e32
154 ; SI: v_subrev_i32_e32
154 ; SI: v_sub_i32_e32
155155 ; SI: v_subb_u32_e32
156156 define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
157157 %tid = call i32 @llvm.r600.read.tidig.x() readnone
159159 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
160160 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
161161
162 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
162163 ; VI: flat_load_ushort v[[A_HI:[0-9]+]]
163 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
164
165 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
164166 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
165 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
166167
167168 ; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
168169 ; VI-NOT: and