llvm.org GIT mirror: llvm / e24854a

[AMDGPU] added writelane intrinsic

Summary:
For use by the LLPC SPV_AMD_shader_ballot extension. The v_writelane instruction was already implemented for use by SGPR spilling, but I had to add an extra dummy operand, tied to the destination, to represent that all lanes except the selected one keep the old value of the destination register. The .ll test changes are due to schedule changes caused by that new operand.

Differential Revision: https://reviews.llvm.org/D42838

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@326353 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Tim Renouf, 1 year, 11 months ago

9 changed files with 156 additions and 23 deletions.
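For reference, a minimal IR-level sketch of how the new intrinsic is used (it mirrors the llvm.amdgcn.writelane.ll test added further down; the kernel and value names here are illustrative, not part of the patch):

    declare i32 @llvm.amdgcn.writelane(i32, i32, i32)

    ; Write %value into lane %lane of the wave; every other lane keeps its
    ; copy of %oldval. %value and %lane must be uniform across the active
    ; lanes, otherwise the result is undefined.
    define amdgpu_kernel void @writelane_example(i32 addrspace(1)* %out, i32 %value, i32 %lane) {
      %oldval = load i32, i32 addrspace(1)* %out
      %result = call i32 @llvm.amdgcn.writelane(i32 %value, i32 %lane, i32 %oldval)
      store i32 %result, i32 addrspace(1)* %out, align 4
      ret void
    }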
   GCCBuiltin<"__builtin_amdgcn_readlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
 
+// The value to write and lane select arguments must be uniform across the
+// currently active threads of the current wave. Otherwise, the result is
+// undefined.
+def int_amdgcn_writelane :
+  GCCBuiltin<"__builtin_amdgcn_writelane">,
+  Intrinsic<[llvm_i32_ty], [
+    llvm_i32_ty,    // uniform value to write: returned by the selected lane
+    llvm_i32_ty,    // uniform lane select
+    llvm_i32_ty     // returned by all lanes other than the selected one
+  ],
+  [IntrNoMem, IntrConvergent]
+>;
+
 def int_amdgcn_alignbit : Intrinsic<[llvm_i32_ty],
   [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable]
     }
   }
 
-  // Verify VOP*
-  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
+  // Verify VOP*. Ignore multiple sgpr operands on writelane.
+  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
+      && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
     // Only look at the true operands. Only a real operand can use the constant
     // bus, and we don't want to check pseudo-operands like the source modifier
     // flags.
       legalizeOpWithMove(MI, Src0Idx);
   }
 
+  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
+  // both the value to write (src0) and lane select (src1). Fix up non-SGPR
+  // src0/src1 with V_READFIRSTLANE.
+  if (Opc == AMDGPU::V_WRITELANE_B32) {
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+    MachineOperand &Src0 = MI.getOperand(Src0Idx);
+    const DebugLoc &DL = MI.getDebugLoc();
+    if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
+      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+          .add(Src0);
+      Src0.ChangeToRegister(Reg, false);
+    }
+    if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
+      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      const DebugLoc &DL = MI.getDebugLoc();
+      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+          .add(Src1);
+      Src1.ChangeToRegister(Reg, false);
+    }
+    return;
+  }
+
 // VOP2 src0 instructions support all operand types, so we don't need to check
 // their legality. If src1 is already legal, we don't need to do anything.
 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
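In IR terms this legalization fires when the value to write or the lane select ends up in a VGPR, for example when the lane index is loaded through a per-thread address. A minimal sketch of such a case (a variant of the test_writelane_vreg_lane test added below; names are illustrative), for which the backend inserts a v_readfirstlane_b32 before the v_writelane_b32. Note that, per the comment on the intrinsic, the result is only well defined if the lane select is in fact uniform:

    declare i32 @llvm.amdgcn.writelane(i32, i32, i32)
    declare i32 @llvm.amdgcn.workitem.id.x()

    define amdgpu_kernel void @writelane_divergent_lane(i32 addrspace(1)* %out, i32 addrspace(1)* %lanes) {
      %tid = call i32 @llvm.amdgcn.workitem.id.x()
      %gep = getelementptr i32, i32 addrspace(1)* %lanes, i32 %tid
      %lane = load i32, i32 addrspace(1)* %gep      ; lane select lives in a VGPR
      %oldval = load i32, i32 addrspace(1)* %out
      %result = call i32 @llvm.amdgcn.writelane(i32 7, i32 %lane, i32 %oldval)
      store i32 %result, i32 addrspace(1)* %out, align 4
      ret void
    }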
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MBB->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
 
   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
     = MFI->getSGPRToVGPRSpills(Index);
     if (SpillToVGPR) {
       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
 
+      // During SGPR spilling to VGPR, determine if the VGPR is defined. The
+      // only circumstance in which we say it is undefined is when it is the
+      // first spill to this VGPR in the first basic block.
+      bool VGPRDefined = true;
+      if (MBB == &MF->front())
+        VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
+
+      // Mark the "old value of vgpr" input undef only if this is the first sgpr
+      // spill to this specific vgpr in the first basic block.
       BuildMI(*MBB, MI, DL,
               TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
               Spill.VGPR)
         .addReg(SubReg, getKillRegState(IsKill))
-        .addImm(Spill.Lane);
+        .addImm(Spill.Lane)
+        .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
 
       // FIXME: Since this spills to another register instead of an actual
       // frame index, we should delete the frame index when all references to
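The tied old-value operand is what lets a sequence of writelanes accumulate values into different lanes of a single VGPR, which is what the SGPR-spilling code above does at the MIR level. A minimal IR-level sketch of that pattern (names are illustrative; the final store only makes the packed value observable, as the tests below do):

    declare i32 @llvm.amdgcn.writelane(i32, i32, i32)

    define amdgpu_kernel void @writelane_chain(i32 addrspace(1)* %out, i32 %a, i32 %b) {
      %oldval = load i32, i32 addrspace(1)* %out
      ; Put %a in lane 0 and %b in lane 1 of the same register; the third
      ; operand threads the previous contents through each call.
      %v0 = call i32 @llvm.amdgcn.writelane(i32 %a, i32 0, i32 %oldval)
      %v1 = call i32 @llvm.amdgcn.writelane(i32 %b, i32 1, i32 %v0)
      store i32 %v1, i32 addrspace(1)* %out, align 4
      ret void
    }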
   let HasSDWA9 = 0;
 }
 
-def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
+def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
   let Outs32 = (outs VGPR_32:$vdst);
   let Outs64 = Outs32;
-  let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
+  let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in);
   let Ins64 = Ins32;
   let Asm32 = " $vdst, $src0, $src1";
   let Asm64 = Asm32;
   let HasExt = 0;
   let HasSDWA9 = 0;
+  let HasSrc2 = 0;
+  let HasSrc2Mods = 0;
 }
 
 //===----------------------------------------------------------------------===//
 def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
   [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
 
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
+let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
+  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+} // End $vdst = $vdst_in, DisableEncoding $vdst_in
 } // End isConvergent = 1
 
 defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT>;
 
 defm V_READLANE_B32 : VOP2_Real_si <0x01>;
 
-let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
+let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
 defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
 }
 
 ; GCN-DAG: buffer_store_dword v32
 ; GCN-DAG: buffer_store_dword v33
 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
-; GCN: v_writelane_b32
-
+; GCN-DAG: v_writelane_b32
 ; GCN-DAG: s_add_u32 s32, s32, 0xb00{{$}}
-
 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
-; GCN: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
-; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
-
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}}
-; GCN: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
+; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
+
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
 
 ; GCN: s_swappc_b64
 
 ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
 
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
-; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
-; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
 
 ; GCN-NOT: s_add_u32 s32, s32, 0x800
 
 ; GCN-DAG: v_writelane_b32 v32, s35,
 ; GCN-DAG: s_add_u32 s32, s32, 0x300{{$}}
 ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
-; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
 ; GCN-DAG: s_mov_b32 s33, s5
 
 
 
 bb.1:
   $vgpr0,$sgpr0_sgpr1 = V_ADD_I32_e64 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
-  $vgpr4 = V_WRITELANE_B32 $sgpr0, $sgpr0
+  $vgpr4 = V_WRITELANE_B32 $sgpr0, $sgpr0, $vgpr4
   S_BRANCH %bb.2
 
 bb.2:
 
 bb.3:
   $vgpr0,implicit $vcc = V_ADD_I32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
-  $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo
+  $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo, $vgpr4
   S_ENDPGM
 
 ...
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+
+; CHECK-LABEL: {{^}}test_writelane_sreg:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_sreg:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 32, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_vreg_lane:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
+define amdgpu_kernel void @test_writelane_vreg_lane(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+  %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
+  %oldval = load i32, i32 addrspace(1)* %out
+  %lane = extractelement <2 x i32> %args, i32 1
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 12, i32 %lane, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; TODO: m0 should be folded.
+; CHECK-LABEL: {{^}}test_writelane_m0_sreg:
+; CHECK: s_mov_b32 m0, -1
+; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
+; CHECK: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %m0, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
+define amdgpu_kernel void @test_writelane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 32, i32 %oldval) #0
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_sreg_oldval:
+; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
+; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_oldval:
+; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
+; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_oldval(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone }
 ; GCN-DAG: v_writelane_b32 v34, s35, 2
 ; GCN-DAG: s_add_u32 s32, s32, 0x400
 
-; GCN: s_getpc_b64
+; GCN-DAG: s_getpc_b64
 ; GCN: s_swappc_b64
 
 ; GCN: s_getpc_b64 s[6:7]