llvm.org GIT mirror llvm / d2741da
Merging r261385:
------------------------------------------------------------------------
r261385 | thomas.stellard | 2016-02-19 16:37:25 -0800 (Fri, 19 Feb 2016) | 20 lines

AMDGPU/SI: Use v_readfirstlane to legalize SMRD with VGPR base pointer

Summary:
Instead of trying to replace an SMRD instruction that has a VGPR base
pointer with an equivalent MUBUF instruction, we now copy the base
pointer to SGPRs using v_readfirstlane.

This is safe to do, because any load selected as an SMRD instruction has
been proven to have a uniform base pointer, so each thread in the wave
will have the same pointer value in VGPRs.

This will fix some errors on VI from trying to replace SMRD instructions
with addr64-enabled MUBUF instructions that don't exist.

Reviewers: arsenm, cfang, nhaehnle

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D17305
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@271700 91177308-0d34-0410-b5e6-96231b3b80d8

Tom Stellard, 3 years ago
5 changed file(s) with 45 addition(s) and 254 deletion(s).
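The fix builds on the readlaneVGPRToSGPR helper declared in the SIInstrInfo header hunk below and called from the new legalizeOperandsSMRD. As a hedged sketch of what such a helper does (the body here is illustrative, written in the style of the surrounding SIInstrInfo code, and may not match the committed implementation line for line): read each 32-bit component of the uniform VGPR pointer into an SGPR with V_READFIRSTLANE_B32, then reassemble the pieces with REG_SEQUENCE.

    // Sketch only: the name and signature come from the header hunk below;
    // this body is an illustration of the technique, not the committed code.
    unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
                                             MachineRegisterInfo &MRI) const {
      const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
      const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
      unsigned DstReg = MRI.createVirtualRegister(SRC);
      unsigned SubRegs = VRC->getSize() / 4; // number of 32-bit components

      SmallVector<unsigned, 8> SRegs;
      for (unsigned i = 0; i < SubRegs; ++i) {
        // Reading one lane is safe here: the pointer is uniform, so every
        // lane of the wave holds the same value.
        unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
        BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
                get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
            .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
        SRegs.push_back(SGPR);
      }

      // Reassemble the 32-bit pieces into a single wide SGPR value.
      MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI,
                                        UseMI->getDebugLoc(),
                                        get(AMDGPU::REG_SEQUENCE), DstReg);
      for (unsigned i = 0; i < SubRegs; ++i) {
        MIB.addReg(SRegs[i]);
        MIB.addImm(RI.getSubRegFromChannel(i));
      }
      return DstReg;
    }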
@@ -1623,18 +1623,6 @@
   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
-  case AMDGPU::S_LOAD_DWORD_IMM:
-  case AMDGPU::S_LOAD_DWORD_SGPR:
-  case AMDGPU::S_LOAD_DWORD_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
-  case AMDGPU::S_LOAD_DWORDX2_IMM:
-  case AMDGPU::S_LOAD_DWORDX2_SGPR:
-  case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
-  case AMDGPU::S_LOAD_DWORDX4_IMM:
-  case AMDGPU::S_LOAD_DWORDX4_SGPR:
-  case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
@@ -1993,6 +1981,20 @@
   return DstReg;
 }
 
+void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
+                                       MachineInstr *MI) const {
+
+  // If the pointer is stored in VGPRs, then we need to move it to
+  // SGPRs using v_readfirstlane. This is safe because we only select
+  // loads with uniform pointers as SMRD instructions, so we know the
+  // pointer value is uniform.
+  MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
+  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
+  }
+}
+
 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
 
@@ -2005,6 +2007,12 @@
   // Legalize VOP3
   if (isVOP3(*MI)) {
     legalizeOperandsVOP3(MRI, MI);
+    return;
+  }
+
+  // Legalize SMRD
+  if (isSMRD(*MI)) {
+    legalizeOperandsSMRD(MRI, MI);
     return;
   }
 
@@ -2280,219 +2288,6 @@
   }
 }
 
-void SIInstrInfo::splitSMRD(MachineInstr *MI,
-                            const TargetRegisterClass *HalfRC,
-                            unsigned HalfImmOp, unsigned HalfSGPROp,
-                            MachineInstr *&Lo, MachineInstr *&Hi) const {
-
-  DebugLoc DL = MI->getDebugLoc();
-  MachineBasicBlock *MBB = MI->getParent();
-  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-  unsigned RegLo = MRI.createVirtualRegister(HalfRC);
-  unsigned RegHi = MRI.createVirtualRegister(HalfRC);
-  unsigned HalfSize = HalfRC->getSize();
-  const MachineOperand *OffOp =
-      getNamedOperand(*MI, AMDGPU::OpName::offset);
-  const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
-
-  // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
-  // on VI.
-
-  bool IsKill = SBase->isKill();
-  if (OffOp) {
-    bool isVI =
-        MBB->getParent()->getSubtarget().getGeneration() >=
-        AMDGPUSubtarget::VOLCANIC_ISLANDS;
-    unsigned OffScale = isVI ? 1 : 4;
-    // Handle the _IMM variant
-    unsigned LoOffset = OffOp->getImm() * OffScale;
-    unsigned HiOffset = LoOffset + HalfSize;
-    Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
-             // Use addReg instead of addOperand
-             // to make sure kill flag is cleared.
-             .addReg(SBase->getReg(), 0, SBase->getSubReg())
-             .addImm(LoOffset / OffScale);
-
-    if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
-      unsigned OffsetSGPR =
-          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
-          .addImm(HiOffset); // The offset in register is in bytes.
-      Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
-               .addReg(SBase->getReg(), getKillRegState(IsKill),
-                       SBase->getSubReg())
-               .addReg(OffsetSGPR);
-    } else {
-      Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
-               .addReg(SBase->getReg(), getKillRegState(IsKill),
-                       SBase->getSubReg())
-               .addImm(HiOffset / OffScale);
-    }
-  } else {
-    // Handle the _SGPR variant
-    MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
-    Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
-             .addReg(SBase->getReg(), 0, SBase->getSubReg())
-             .addOperand(*SOff);
-    unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-    BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
-        .addReg(SOff->getReg(), 0, SOff->getSubReg())
-        .addImm(HalfSize);
-    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
-             .addReg(SBase->getReg(), getKillRegState(IsKill),
-                     SBase->getSubReg())
-             .addReg(OffsetSGPR);
-  }
-
-  unsigned SubLo, SubHi;
-  const TargetRegisterClass *NewDstRC;
-  switch (HalfSize) {
-  case 4:
-    SubLo = AMDGPU::sub0;
-    SubHi = AMDGPU::sub1;
-    NewDstRC = &AMDGPU::VReg_64RegClass;
-    break;
-  case 8:
-    SubLo = AMDGPU::sub0_sub1;
-    SubHi = AMDGPU::sub2_sub3;
-    NewDstRC = &AMDGPU::VReg_128RegClass;
-    break;
-  case 16:
-    SubLo = AMDGPU::sub0_sub1_sub2_sub3;
-    SubHi = AMDGPU::sub4_sub5_sub6_sub7;
-    NewDstRC = &AMDGPU::VReg_256RegClass;
-    break;
-  case 32:
-    SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
-    SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
-    NewDstRC = &AMDGPU::VReg_512RegClass;
-    break;
-  default:
-    llvm_unreachable("Unhandled HalfSize");
-  }
-
-  unsigned OldDst = MI->getOperand(0).getReg();
-  unsigned NewDst = MRI.createVirtualRegister(NewDstRC);
-
-  MRI.replaceRegWith(OldDst, NewDst);
-
-  BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst)
-      .addReg(RegLo)
-      .addImm(SubLo)
-      .addReg(RegHi)
-      .addImm(SubHi);
-}
-
-void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
-                                 MachineRegisterInfo &MRI,
-                                 SmallVectorImpl<MachineInstr *> &Worklist) const {
-  MachineBasicBlock *MBB = MI->getParent();
-  int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
-  assert(DstIdx != -1);
-  unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass;
-  switch(RI.getRegClass(DstRCID)->getSize()) {
-  case 4:
-  case 8:
-  case 16: {
-    unsigned NewOpcode = getVALUOp(*MI);
-    unsigned RegOffset;
-    unsigned ImmOffset;
-
-    if (MI->getOperand(2).isReg()) {
-      RegOffset = MI->getOperand(2).getReg();
-      ImmOffset = 0;
-    } else {
-      assert(MI->getOperand(2).isImm());
-      // SMRD instructions take a dword offset on SI and a byte offset on VI,
-      // while MUBUF instructions always take a byte offset.
-      ImmOffset = MI->getOperand(2).getImm();
-      if (MBB->getParent()->getSubtarget().getGeneration() <=
-          AMDGPUSubtarget::SEA_ISLANDS)
-        ImmOffset <<= 2;
-      RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-
-      if (isUInt<12>(ImmOffset)) {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-                RegOffset)
-            .addImm(0);
-      } else {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-                RegOffset)
-            .addImm(ImmOffset);
-        ImmOffset = 0;
-      }
-    }
-
-    unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
-    unsigned DWord0 = RegOffset;
-    unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
-
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
-        .addImm(0);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
-        .addImm(RsrcDataFormat & 0xFFFFFFFF);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
-        .addImm(RsrcDataFormat >> 32);
-    BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
-        .addReg(DWord0)
-        .addImm(AMDGPU::sub0)
-        .addReg(DWord1)
-        .addImm(AMDGPU::sub1)
-        .addReg(DWord2)
-        .addImm(AMDGPU::sub2)
-        .addReg(DWord3)
-        .addImm(AMDGPU::sub3);
-
-    const MCInstrDesc &NewInstDesc = get(NewOpcode);
-    const TargetRegisterClass *NewDstRC
-        = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
-    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
-    unsigned DstReg = MI->getOperand(0).getReg();
-    MRI.replaceRegWith(DstReg, NewDstReg);
-
-    MachineInstr *NewInst =
-        BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
-            .addOperand(MI->getOperand(1)) // sbase
-            .addReg(SRsrc)
-            .addImm(0)
-            .addImm(ImmOffset)
-            .addImm(0) // glc
-            .addImm(0) // slc
-            .addImm(0) // tfe
-            .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
-    MI->eraseFromParent();
-
-    legalizeOperands(NewInst);
-    addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
-    break;
-  }
-  case 32: {
-    MachineInstr *Lo, *Hi;
-    addUsersToMoveToVALUWorklist(MI->getOperand(0).getReg(), MRI, Worklist);
-    splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
-              AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
-    MI->eraseFromParent();
-    moveSMRDToVALU(Lo, MRI, Worklist);
-    moveSMRDToVALU(Hi, MRI, Worklist);
-    break;
-  }
-
-  case 64: {
-    MachineInstr *Lo, *Hi;
-    addUsersToMoveToVALUWorklist(MI->getOperand(0).getReg(), MRI, Worklist);
-    splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
-              AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
-    MI->eraseFromParent();
-    moveSMRDToVALU(Lo, MRI, Worklist);
-    moveSMRDToVALU(Hi, MRI, Worklist);
-    break;
-  }
-  }
-}
-
 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
   SmallVector<MachineInstr *, 128> Worklist;
   Worklist.push_back(&TopInst);
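The removed splitSMRD/moveSMRDToVALU code above leaned on the offset-encoding rules spelled out in its comments: the SMRD immediate is a dword offset on SI/CI and a byte offset on VI, while MUBUF offsets are always in bytes. The following stand-alone sketch (illustrative names, not an LLVM API) shows the scaling splitSMRD applied when computing the immediate for the high half of a split load:

    // Stand-alone illustration of splitSMRD's offset scaling. The SMRD _IMM
    // field is in dwords on SI/CI (OffScale == 4) and in bytes on VI
    // (OffScale == 1); the high half starts HalfSizeBytes past the low half.
    #include <cassert>
    #include <cstdio>

    struct SplitOffsets {
      unsigned LoImm; // immediate for the low-half load
      unsigned HiImm; // immediate for the high-half load
    };

    static SplitOffsets splitSMRDOffsets(unsigned ImmField,
                                         unsigned HalfSizeBytes, bool IsVI) {
      unsigned OffScale = IsVI ? 1 : 4;           // dwords on SI/CI, bytes on VI
      unsigned LoBytes = ImmField * OffScale;     // byte offset of the low half
      unsigned HiBytes = LoBytes + HalfSizeBytes; // byte offset of the high half
      assert(HiBytes % OffScale == 0 && "high half must stay dword aligned");
      return {LoBytes / OffScale, HiBytes / OffScale};
    }

    int main() {
      // Splitting an S_LOAD_DWORDX8 (two 16-byte halves) at immediate 0x10:
      SplitOffsets SI = splitSMRDOffsets(0x10, 16, /*IsVI=*/false);
      SplitOffsets VI = splitSMRDOffsets(0x40, 16, /*IsVI=*/true);
      std::printf("SI: lo=0x%x hi=0x%x\n", SI.LoImm, SI.HiImm); // 0x10, 0x14
      std::printf("VI: lo=0x%x hi=0x%x\n", VI.LoImm, VI.HiImm); // 0x40, 0x50
      return 0;
    }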
@@ -2508,10 +2303,6 @@
     // Handle some special cases
     switch (Opcode) {
     default:
-      if (isSMRD(*Inst)) {
-        moveSMRDToVALU(Inst, MRI, Worklist);
-        continue;
-      }
       break;
     case AMDGPU::S_AND_B64:
       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
@@ -407,18 +407,11 @@
   unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
                               MachineRegisterInfo &MRI) const;
 
+  void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+
   /// \brief Legalize all operands in this instruction. This function may
   /// create new instructions and insert them before \p MI.
   void legalizeOperands(MachineInstr *MI) const;
-
-  /// \brief Split an SMRD instruction into two smaller loads of half the
-  /// size, storing the results in \p Lo and \p Hi.
-  void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC,
-                 unsigned HalfImmOp, unsigned HalfSGPROp,
-                 MachineInstr *&Lo, MachineInstr *&Hi) const;
-
-  void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI,
-                      SmallVectorImpl<MachineInstr *> &Worklist) const;
 
   /// \brief Replace this instruction's opcode with the equivalent VALU
   /// opcode. This function will also move the users of \p MI to the
@@ -7,7 +7,9 @@
 ; FUNC-LABEL: {{^}}missing_store_reduced:
 ; SI: ds_read_b64
 ; SI: buffer_store_dword
-; SI: buffer_load_dword
+; SI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
 ; SI: buffer_store_dword
 ; SI: s_endpgm
 define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
@@ -52,10 +52,14 @@
 ; Test moving an SMRD instruction to the VALU
 
 ; GCN-LABEL: {{^}}smrd_valu:
-; FIXME: We should be using flat load for HSA.
-; GCN: buffer_load_dword [[OUT:v[0-9]+]]
-; GCN-NOHSA: buffer_store_dword [[OUT]]
-; GCN-HSA: flat_store_dword [[OUT]]
+; SI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
+; GCN-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
+; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
+; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
+; GCN-NOHSA: buffer_store_dword [[V_OUT]]
+; GCN-HSA: flat_store_dword [[V_OUT]]
 define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
 entry:
   %tmp = icmp ne i32 %a, 0
@@ -69,15 +69,14 @@
   ret void
 }
 
-; Technically we could reorder these, but just comparing the
-; instruction type of the load is insufficient.
-
-; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_store_dword
-; CI: buffer_load_dword
-; CI: buffer_store_dword
-define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
+; CI: buffer_store_dword
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
+; CI: buffer_store_dword
+define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
@@ -94,8 +93,10 @@
 }
 
 ; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
-; CI: buffer_load_dword
-; CI: buffer_load_dword
+; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
+; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
 ; CI: ds_write_b32
 ; CI: buffer_store_dword
 define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {