llvm.org GIT mirror llvm / e5d3d15

AMDGPU/GlobalISel: Implement custom kernel arg lowering

Avoid using allocateKernArg / AssignFn. We do not want any of the type splitting properties of normal calling convention lowering.

For now at least this exists alongside the IR argument lowering pass. This is necessary to handle struct padding correctly while some arguments are still skipped by the IR argument lowering pass.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@336373 91177308-0d34-0410-b5e6-96231b3b80d8
Matt Arsenault, 2 years ago

6 changed files with 830 additions and 53 deletions.
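The heart of the change is the per-argument offset computation added to AMDGPUCallLowering::lowerFormalArguments in the first hunk below. The following is a minimal standalone sketch of that arithmetic, not the LLVM sources: the sizes and alignments are hardcoded stand-ins for DataLayout::getTypeAllocSize / getABITypeAlignment, and alignTo / MinAlign mirror their llvm/Support/MathExtras.h namesakes.

    #include <cstdint>
    #include <cstdio>

    // Round Value up to the next multiple of Align (a power of two).
    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;
    }

    // Largest power of two dividing both A and B (lowest set bit of A | B).
    static uint64_t MinAlign(uint64_t A, uint64_t B) {
      return (A | B) & (1 + ~(A | B));
    }

    int main() {
      // Stand-ins for the {i32, i64}, i8, {i32, i64} signature of the
      // struct_argument_alignment test below: {alloc size, ABI alignment}.
      struct { uint64_t AllocSize, ABIAlign; } Args[] = {{16, 8}, {1, 1}, {16, 8}};

      const uint64_t KernArgBaseAlign = 16; // kernarg segment base alignment
      const uint64_t BaseOffset = 0;        // getExplicitKernelArgOffset(F) on HSA
      uint64_t ExplicitArgOffset = 0;

      for (const auto &A : Args) {
        uint64_t ArgOffset = alignTo(ExplicitArgOffset, A.ABIAlign) + BaseOffset;
        ExplicitArgOffset = alignTo(ExplicitArgOffset, A.ABIAlign) + A.AllocSize;
        // The load alignment is capped by the offset within the 16-byte
        // aligned kernarg segment: a load at byte 24 may only assume align 8.
        printf("offset %llu, load align %llu\n", (unsigned long long)ArgOffset,
               (unsigned long long)MinAlign(KernArgBaseAlign, ArgOffset));
      }
      // Prints offsets 0, 16, 24 with load alignments 16, 16, 8, matching
      // the G_GEP constants and G_LOAD alignments in the checks below.
      return 0;
    }

For this layout the explicit arguments end at 24 + 16 = 40 bytes, the kernarg_segment_byte_size value the updated SelectionDAG test checks.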
4242
4343 unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
4444 Type *ParamTy,
45 unsigned Offset) const {
45 uint64_t Offset) const {
4646
4747 MachineFunction &MF = MIRBuilder.getMF();
4848 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6565 }
6666
6767 void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
68 Type *ParamTy, unsigned Offset,
68 Type *ParamTy, uint64_t Offset,
69 unsigned Align,
6970 unsigned DstReg) const {
7071 MachineFunction &MF = MIRBuilder.getMF();
7172 const Function &F = MF.getFunction();
7374 PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
7475 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
7576 unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
76 unsigned Align = DL.getABITypeAlignment(ParamTy);
7777 unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
7878
7979 MachineMemOperand *MMO =
9494 return false;
9595
9696 MachineFunction &MF = MIRBuilder.getMF();
97 const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget());
97 const SISubtarget *Subtarget = &MF.getSubtarget<SISubtarget>();
9898 MachineRegisterInfo &MRI = MF.getRegInfo();
9999 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
100100 const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
142142 unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
143143 // FIXME: Need to add reg as live-in
144144 CCInfo.AllocateReg(FlatScratchInitReg);
145 }
146
147 // The infrastructure for normal calling convention lowering is essentially
148 // useless for kernels. We want to avoid any kind of legalization or argument
149 // splitting.
150 if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
151 unsigned i = 0;
152 const unsigned KernArgBaseAlign = 16;
153 const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
154 uint64_t ExplicitArgOffset = 0;
155
156 // TODO: Align down to dword alignment and extract bits for extending loads.
157 for (auto &Arg : F.args()) {
158 Type *ArgTy = Arg.getType();
159 unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
160 if (AllocSize == 0)
161 continue;
162
163 unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
164
165 uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
166 ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
167
168 unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
169 ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
170 lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]);
171 ++i;
172 }
173
174 return true;
145175 }
146176
147177 unsigned NumArgs = F.arg_size();
215245 return true;
216246 }
217247
218 for (unsigned i = 0; i != ArgLocs.size(); ++i, ++Arg) {
219 // FIXME: We should be getting DebugInfo from the arguments some how.
220 CCValAssign &VA = ArgLocs[i];
221 lowerParameter(MIRBuilder, Arg->getType(),
222 VA.getLocMemOffset() +
223 Subtarget->getExplicitKernelArgOffset(F), VRegs[i]);
224 }
225
226 return true;
227 }
248 return false;
249 }
2525 AMDGPUAS AMDGPUASI;
2626
2727 unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
28 unsigned Offset) const;
28 uint64_t Offset) const;
2929
3030 void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
31 unsigned Offset, unsigned DstReg) const;
31 uint64_t Offset, unsigned Align,
32 unsigned DstReg) const;
3233
3334 public:
3435 AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
8484 ]>>
8585 ]>;
8686
87 // Calling convention for compute kernels
88 def CC_AMDGPU_Kernel : CallingConv<[
89 CCCustom<"allocateKernArg">
90 ]>;
91
9287 def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
9388 (sequence "VGPR%u", 24, 255)
9489 >;
136131 ]>;
137132
138133 def CC_AMDGPU : CallingConv<[
139 CCIf<"static_cast<const AMDGPUSubtarget&>"
140 "(State.getMachineFunction().getSubtarget()).getGeneration() >="
141 "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
142 "!AMDGPU::isShader(State.getCallingConv())",
143 CCDelegateTo<CC_AMDGPU_Kernel>>,
144 CCIf<"static_cast<const AMDGPUSubtarget&>"
145 "(State.getMachineFunction().getSubtarget()).getGeneration() < "
146 "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
147 "!AMDGPU::isShader(State.getCallingConv())",
148 CCDelegateTo<CC_AMDGPU_Kernel>>,
149134 CCIf<"static_cast<const AMDGPUSubtarget&>"
150135 "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
151136 "AMDGPUSubtarget::SOUTHERN_ISLANDS",
842842 switch (CC) {
843843 case CallingConv::AMDGPU_KERNEL:
844844 case CallingConv::SPIR_KERNEL:
845 return CC_AMDGPU_Kernel;
845 llvm_unreachable("kernels should not be handled here");
846846 case CallingConv::AMDGPU_VS:
847847 case CallingConv::AMDGPU_GS:
848848 case CallingConv::AMDGPU_PS:
865865 switch (CC) {
866866 case CallingConv::AMDGPU_KERNEL:
867867 case CallingConv::SPIR_KERNEL:
868 return CC_AMDGPU_Kernel;
868 llvm_unreachable("kernels should not be handled here");
869869 case CallingConv::AMDGPU_VS:
870870 case CallingConv::AMDGPU_GS:
871871 case CallingConv::AMDGPU_PS:
0 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
1 ; REQUIRES: global-isel
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -O0 -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -global-isel %s -o - | FileCheck -check-prefix=HSA-VI %s
3
4 define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
5 ; HSA-VI-LABEL: name: i8_arg
6 ; HSA-VI: bb.1 (%ir-block.0):
7 ; HSA-VI: liveins: $sgpr4_sgpr5
8 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
9 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
10 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
11 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
12 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
13 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
14 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
15 ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
16 ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
17 ; HSA-VI: S_ENDPGM
18 %ext = zext i8 %in to i32
19 store i32 %ext, i32 addrspace(1)* %out, align 4
20 ret void
21 }
22
23 define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
24 ; HSA-VI-LABEL: name: i8_zext_arg
25 ; HSA-VI: bb.1 (%ir-block.0):
26 ; HSA-VI: liveins: $sgpr4_sgpr5
27 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
28 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
29 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
30 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
31 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
32 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
33 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
34 ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
35 ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
36 ; HSA-VI: S_ENDPGM
37 %ext = zext i8 %in to i32
38 store i32 %ext, i32 addrspace(1)* %out, align 4
39 ret void
40 }
41
42 define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
43 ; HSA-VI-LABEL: name: i8_sext_arg
44 ; HSA-VI: bb.1 (%ir-block.0):
45 ; HSA-VI: liveins: $sgpr4_sgpr5
46 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
47 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
48 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
49 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
50 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
51 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
52 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
53 ; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
54 ; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
55 ; HSA-VI: S_ENDPGM
56 %ext = sext i8 %in to i32
57 store i32 %ext, i32 addrspace(1)* %out, align 4
58 ret void
59 }
60
61 define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
62 ; HSA-VI-LABEL: name: i16_arg
63 ; HSA-VI: bb.1 (%ir-block.0):
64 ; HSA-VI: liveins: $sgpr4_sgpr5
65 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
66 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
67 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
68 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
69 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
70 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
71 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
72 ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
73 ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
74 ; HSA-VI: S_ENDPGM
75 %ext = zext i16 %in to i32
76 store i32 %ext, i32 addrspace(1)* %out, align 4
77 ret void
78 }
79
80 define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
81 ; HSA-VI-LABEL: name: i16_zext_arg
82 ; HSA-VI: bb.1 (%ir-block.0):
83 ; HSA-VI: liveins: $sgpr4_sgpr5
84 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
85 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
86 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
87 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
88 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
89 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
90 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
91 ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
92 ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
93 ; HSA-VI: S_ENDPGM
94 %ext = zext i16 %in to i32
95 store i32 %ext, i32 addrspace(1)* %out, align 4
96 ret void
97 }
98
99 define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
100 ; HSA-VI-LABEL: name: i16_sext_arg
101 ; HSA-VI: bb.1 (%ir-block.0):
102 ; HSA-VI: liveins: $sgpr4_sgpr5
103 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
104 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
105 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
106 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
107 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
108 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
109 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
110 ; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16)
111 ; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
112 ; HSA-VI: S_ENDPGM
113 %ext = sext i16 %in to i32
114 store i32 %ext, i32 addrspace(1)* %out, align 4
115 ret void
116 }
117
118 define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
119 ; HSA-VI-LABEL: name: i32_arg
120 ; HSA-VI: bb.1.entry:
121 ; HSA-VI: liveins: $sgpr4_sgpr5
122 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
123 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
124 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
125 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
126 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
127 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
128 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
129 ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
130 ; HSA-VI: S_ENDPGM
131 entry:
132 store i32 %in, i32 addrspace(1)* %out, align 4
133 ret void
134 }
135
136 define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
137 ; HSA-VI-LABEL: name: f32_arg
138 ; HSA-VI: bb.1.entry:
139 ; HSA-VI: liveins: $sgpr4_sgpr5
140 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
141 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
142 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
143 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `float addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
144 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
145 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
146 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `float addrspace(4)* undef`, align 8, addrspace 4)
147 ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
148 ; HSA-VI: S_ENDPGM
149 entry:
150 store float %in, float addrspace(1)* %out, align 4
151 ret void
152 }
153
154 define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
155 ; HSA-VI-LABEL: name: v2i8_arg
156 ; HSA-VI: bb.1.entry:
157 ; HSA-VI: liveins: $sgpr4_sgpr5
158 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
159 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
160 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
161 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
162 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
163 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
164 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `<2 x i8> addrspace(4)* undef`, align 8, addrspace 4)
165 ; HSA-VI: G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store 2 into %ir.out, addrspace 1)
166 ; HSA-VI: S_ENDPGM
167 entry:
168 store <2 x i8> %in, <2 x i8> addrspace(1)* %out
169 ret void
170 }
171
172 define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
173 ; HSA-VI-LABEL: name: v2i16_arg
174 ; HSA-VI: bb.1.entry:
175 ; HSA-VI: liveins: $sgpr4_sgpr5
176 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
177 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
178 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
179 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
180 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
181 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
182 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `<2 x i16> addrspace(4)* undef`, align 8, addrspace 4)
183 ; HSA-VI: G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
184 ; HSA-VI: S_ENDPGM
185 entry:
186 store <2 x i16> %in, <2 x i16> addrspace(1)* %out
187 ret void
188 }
189
190 define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
191 ; HSA-VI-LABEL: name: v2i32_arg
192 ; HSA-VI: bb.1.entry:
193 ; HSA-VI: liveins: $sgpr4_sgpr5
194 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
195 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
196 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
197 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
198 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
199 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
200 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<2 x i32> addrspace(4)* undef`, addrspace 4)
201 ; HSA-VI: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1)
202 ; HSA-VI: S_ENDPGM
203 entry:
204 store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
205 ret void
206 }
207
208 define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
209 ; HSA-VI-LABEL: name: v2f32_arg
210 ; HSA-VI: bb.1.entry:
211 ; HSA-VI: liveins: $sgpr4_sgpr5
212 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
213 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
214 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
215 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
216 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
217 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
218 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<2 x float> addrspace(4)* undef`, addrspace 4)
219 ; HSA-VI: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1)
220 ; HSA-VI: S_ENDPGM
221 entry:
222 store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
223 ret void
224 }
225
226 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
227 ; HSA-VI-LABEL: name: v3i8_arg
228 ; HSA-VI: bb.1.entry:
229 ; HSA-VI: liveins: $sgpr4_sgpr5
230 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
231 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
232 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
233 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
234 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
235 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
236 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 3 from `<3 x i8> addrspace(4)* undef`, align 8, addrspace 4)
237 ; HSA-VI: G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store 3 into %ir.out, align 4, addrspace 1)
238 ; HSA-VI: S_ENDPGM
239 entry:
240 store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
241 ret void
242 }
243
244 define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
245 ; HSA-VI-LABEL: name: v3i16_arg
246 ; HSA-VI: bb.1.entry:
247 ; HSA-VI: liveins: $sgpr4_sgpr5
248 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
249 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
250 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
251 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
252 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
253 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
254 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 6 from `<3 x i16> addrspace(4)* undef`, align 8, addrspace 4)
255 ; HSA-VI: G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store 6 into %ir.out, align 4, addrspace 1)
256 ; HSA-VI: S_ENDPGM
257 entry:
258 store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
259 ret void
260 }
261
262 define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
263 ; HSA-VI-LABEL: name: v3i32_arg
264 ; HSA-VI: bb.1.entry:
265 ; HSA-VI: liveins: $sgpr4_sgpr5
266 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
267 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
268 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
269 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
270 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
271 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
272 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 12 from `<3 x i32> addrspace(4)* undef`, align 16, addrspace 4)
273 ; HSA-VI: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1)
274 ; HSA-VI: S_ENDPGM
275 entry:
276 store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
277 ret void
278 }
279
280 define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
281 ; HSA-VI-LABEL: name: v3f32_arg
282 ; HSA-VI: bb.1.entry:
283 ; HSA-VI: liveins: $sgpr4_sgpr5
284 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
285 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
286 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
287 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
288 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
289 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
290 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 12 from `<3 x float> addrspace(4)* undef`, align 16, addrspace 4)
291 ; HSA-VI: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1)
292 ; HSA-VI: S_ENDPGM
293 entry:
294 store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
295 ret void
296 }
297
298 define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
299 ; HSA-VI-LABEL: name: v4i8_arg
300 ; HSA-VI: bb.1.entry:
301 ; HSA-VI: liveins: $sgpr4_sgpr5
302 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
303 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
304 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
305 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
306 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
307 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
308 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `<4 x i8> addrspace(4)* undef`, align 8, addrspace 4)
309 ; HSA-VI: G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
310 ; HSA-VI: S_ENDPGM
311 entry:
312 store <4 x i8> %in, <4 x i8> addrspace(1)* %out
313 ret void
314 }
315
316 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
317 ; HSA-VI-LABEL: name: v4i16_arg
318 ; HSA-VI: bb.1.entry:
319 ; HSA-VI: liveins: $sgpr4_sgpr5
320 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
321 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
322 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
323 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
324 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
325 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
326 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<4 x i16> addrspace(4)* undef`, addrspace 4)
327 ; HSA-VI: G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
328 ; HSA-VI: S_ENDPGM
329 entry:
330 store <4 x i16> %in, <4 x i16> addrspace(1)* %out
331 ret void
332 }
333
334 define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
335 ; HSA-VI-LABEL: name: v4i32_arg
336 ; HSA-VI: bb.1.entry:
337 ; HSA-VI: liveins: $sgpr4_sgpr5
338 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
339 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
340 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
341 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
342 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
343 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
344 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4)
345 ; HSA-VI: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1)
346 ; HSA-VI: S_ENDPGM
347 entry:
348 store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
349 ret void
350 }
351
352 define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
353 ; HSA-VI-LABEL: name: v4f32_arg
354 ; HSA-VI: bb.1.entry:
355 ; HSA-VI: liveins: $sgpr4_sgpr5
356 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
357 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
358 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
359 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
360 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
361 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
362 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<4 x float> addrspace(4)* undef`, addrspace 4)
363 ; HSA-VI: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1)
364 ; HSA-VI: S_ENDPGM
365 entry:
366 store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
367 ret void
368 }
369
370 define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
371 ; HSA-VI-LABEL: name: v8i8_arg
372 ; HSA-VI: bb.1.entry:
373 ; HSA-VI: liveins: $sgpr4_sgpr5
374 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
375 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
376 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
377 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
378 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
379 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
380 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<8 x i8> addrspace(4)* undef`, addrspace 4)
381 ; HSA-VI: G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
382 ; HSA-VI: S_ENDPGM
383 entry:
384 store <8 x i8> %in, <8 x i8> addrspace(1)* %out
385 ret void
386 }
387
388 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
389 ; HSA-VI-LABEL: name: v8i16_arg
390 ; HSA-VI: bb.1.entry:
391 ; HSA-VI: liveins: $sgpr4_sgpr5
392 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
393 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
394 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
395 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
396 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
397 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
398 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<8 x i16> addrspace(4)* undef`, addrspace 4)
399 ; HSA-VI: G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1)
400 ; HSA-VI: S_ENDPGM
401 entry:
402 store <8 x i16> %in, <8 x i16> addrspace(1)* %out
403 ret void
404 }
405
406 define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
407 ; HSA-VI-LABEL: name: v8i32_arg
408 ; HSA-VI: bb.1.entry:
409 ; HSA-VI: liveins: $sgpr4_sgpr5
410 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
411 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
412 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
413 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
414 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
415 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
416 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<8 x i32> addrspace(4)* undef`, align 16, addrspace 4)
417 ; HSA-VI: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1)
418 ; HSA-VI: S_ENDPGM
419 entry:
420 store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
421 ret void
422 }
423
424 define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
425 ; HSA-VI-LABEL: name: v8f32_arg
426 ; HSA-VI: bb.1.entry:
427 ; HSA-VI: liveins: $sgpr4_sgpr5
428 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
429 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
430 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
431 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
432 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
433 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
434 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<8 x float> addrspace(4)* undef`, align 16, addrspace 4)
435 ; HSA-VI: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1)
436 ; HSA-VI: S_ENDPGM
437 entry:
438 store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
439 ret void
440 }
441
442 define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
443 ; HSA-VI-LABEL: name: v16i8_arg
444 ; HSA-VI: bb.1.entry:
445 ; HSA-VI: liveins: $sgpr4_sgpr5
446 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
447 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
448 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
449 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
450 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
451 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
452 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<16 x i8> addrspace(4)* undef`, addrspace 4)
453 ; HSA-VI: G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1)
454 ; HSA-VI: S_ENDPGM
455 entry:
456 store <16 x i8> %in, <16 x i8> addrspace(1)* %out
457 ret void
458 }
459
460 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
461 ; HSA-VI-LABEL: name: v16i16_arg
462 ; HSA-VI: bb.1.entry:
463 ; HSA-VI: liveins: $sgpr4_sgpr5
464 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
465 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
466 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
467 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
468 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
469 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
470 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<16 x i16> addrspace(4)* undef`, align 16, addrspace 4)
471 ; HSA-VI: G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store 32 into %ir.out, addrspace 1)
472 ; HSA-VI: S_ENDPGM
473 entry:
474 store <16 x i16> %in, <16 x i16> addrspace(1)* %out
475 ret void
476 }
477
478 define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
479 ; HSA-VI-LABEL: name: v16i32_arg
480 ; HSA-VI: bb.1.entry:
481 ; HSA-VI: liveins: $sgpr4_sgpr5
482 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
483 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
484 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
485 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
486 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
487 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
488 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 64 from `<16 x i32> addrspace(4)* undef`, align 16, addrspace 4)
489 ; HSA-VI: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1)
490 ; HSA-VI: S_ENDPGM
491 entry:
492 store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
493 ret void
494 }
495
496 define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
497 ; HSA-VI-LABEL: name: v16f32_arg
498 ; HSA-VI: bb.1.entry:
499 ; HSA-VI: liveins: $sgpr4_sgpr5
500 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
501 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
502 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
503 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
504 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
505 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
506 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 64 from `<16 x float> addrspace(4)* undef`, align 16, addrspace 4)
507 ; HSA-VI: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1)
508 ; HSA-VI: S_ENDPGM
509 entry:
510 store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
511 ret void
512 }
513
514 define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
515 ; HSA-VI-LABEL: name: kernel_arg_i64
516 ; HSA-VI: bb.1 (%ir-block.0):
517 ; HSA-VI: liveins: $sgpr4_sgpr5
518 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
519 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
520 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
521 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
522 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
523 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
524 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
525 ; HSA-VI: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
526 ; HSA-VI: S_ENDPGM
527 store i64 %a, i64 addrspace(1)* %out, align 8
528 ret void
529 }
530
531 define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
532 ; HSA-VI-LABEL: name: f64_kernel_arg
533 ; HSA-VI: bb.1.entry:
534 ; HSA-VI: liveins: $sgpr4_sgpr5
535 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
536 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
537 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
538 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `double addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
539 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
540 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
541 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `double addrspace(4)* undef`, addrspace 4)
542 ; HSA-VI: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
543 ; HSA-VI: S_ENDPGM
544 entry:
545 store double %in, double addrspace(1)* %out
546 ret void
547 }
548
549 define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
550 ; HSA-VI-LABEL: name: i1_arg
551 ; HSA-VI: bb.1 (%ir-block.0):
552 ; HSA-VI: liveins: $sgpr4_sgpr5
553 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
554 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
555 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
556 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i1 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
557 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
558 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
559 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
560 ; HSA-VI: G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store 1 into %ir.out, addrspace 1)
561 ; HSA-VI: S_ENDPGM
562 store i1 %x, i1 addrspace(1)* %out, align 1
563 ret void
564 }
565
566 define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
567 ; HSA-VI-LABEL: name: i1_arg_zext_i32
568 ; HSA-VI: bb.1 (%ir-block.0):
569 ; HSA-VI: liveins: $sgpr4_sgpr5
570 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
571 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
572 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
573 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
574 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
575 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
576 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
577 ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1)
578 ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
579 ; HSA-VI: S_ENDPGM
580 %ext = zext i1 %x to i32
581 store i32 %ext, i32 addrspace(1)* %out, align 4
582 ret void
583 }
584
585 define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
586 ; HSA-VI-LABEL: name: i1_arg_zext_i64
587 ; HSA-VI: bb.1 (%ir-block.0):
588 ; HSA-VI: liveins: $sgpr4_sgpr5
589 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
590 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
591 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
592 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
593 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
594 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
595 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
596 ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1)
597 ; HSA-VI: G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
598 ; HSA-VI: S_ENDPGM
599 %ext = zext i1 %x to i64
600 store i64 %ext, i64 addrspace(1)* %out, align 8
601 ret void
602 }
603
604 define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
605 ; HSA-VI-LABEL: name: i1_arg_sext_i32
606 ; HSA-VI: bb.1 (%ir-block.0):
607 ; HSA-VI: liveins: $sgpr4_sgpr5
608 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
609 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
610 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
611 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
612 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
613 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
614 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
615 ; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1)
616 ; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
617 ; HSA-VI: S_ENDPGM
618 %ext = sext i1 %x to i32
619 store i32 %ext, i32 addrspace(1)* %out, align 4
620 ret void
621 }
622
623 define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
624 ; HSA-VI-LABEL: name: i1_arg_sext_i64
625 ; HSA-VI: bb.1 (%ir-block.0):
626 ; HSA-VI: liveins: $sgpr4_sgpr5
627 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
628 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
629 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
630 ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
631 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
632 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
633 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
634 ; HSA-VI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1)
635 ; HSA-VI: G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
636 ; HSA-VI: S_ENDPGM
637 %ext = sext i1 %x to i64
638 store i64 %ext, i64 addrspace(1)* %out, align 8
639 ret void
640 }
641
642 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
643 ; HSA-VI-LABEL: name: empty_struct_arg
644 ; HSA-VI: bb.1 (%ir-block.0):
645 ; HSA-VI: liveins: $sgpr4_sgpr5
646 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
647 ; HSA-VI: S_ENDPGM
648 ret void
649 }
650
651 ; The correct load offsets for these:
652 ; load 4 from 0,
653 ; load 8 from 8
654 ; load 4 from 24
655 ; load 8 from 32
656
657 ; With the SelectionDAG argument lowering, the alignments for the
658 ; struct members are not properly considered, making these wrong.
659
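; A worked pass of the new lowering's arithmetic over this signature (a
; sketch, assuming the usual DataLayout): { i32, i64 } has alloc size 16 and
; ABI align 8, so %arg0 lands at offset 0 and the i8 at offset 16; aligning
; 17 up to the next multiple of 8 puts %arg1 at 24. The struct fields
; therefore sit at the 0/8/24/32 byte offsets listed above.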
660 ; FIXME: GlobalISel extractvalue emission broken
661
662 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
663 ; %val0 = extractvalue {i32, i64} %arg0, 0
664 ; %val1 = extractvalue {i32, i64} %arg0, 1
665 ; %val2 = extractvalue {i32, i64} %arg1, 0
666 ; %val3 = extractvalue {i32, i64} %arg1, 1
667 ; store volatile i32 %val0, i32 addrspace(1)* null
668 ; store volatile i64 %val1, i64 addrspace(1)* null
669 ; store volatile i32 %val2, i32 addrspace(1)* null
670 ; store volatile i64 %val3, i64 addrspace(1)* null
671 ; HSA-VI-LABEL: name: struct_argument_alignment
672 ; HSA-VI: bb.1 (%ir-block.1):
673 ; HSA-VI: liveins: $sgpr4_sgpr5
674 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
675 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
676 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
677 ; HSA-VI: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 16 from `{ i32, i64 } addrspace(4)* undef`, addrspace 4)
678 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
679 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
680 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 16, addrspace 4)
681 ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
682 ; HSA-VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64)
683 ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP2]](p4) :: (non-temporal invariant load 16 from `{ i32, i64 } addrspace(4)* undef`, align 8, addrspace 4)
684 ; HSA-VI: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s128), 0
685 ; HSA-VI: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s128), 64
686 ; HSA-VI: [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s128), 0
687 ; HSA-VI: [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s128), 64
688 ; HSA-VI: S_ENDPGM
689 ret void
690 }
691
692 ; No padding between the i8 and the next struct, but rounded up at the end
693 ; to a 4-byte multiple.
694 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
695 ; %val0 = extractvalue <{i32, i64}> %arg0, 0
696 ; %val1 = extractvalue <{i32, i64}> %arg0, 1
697 ; %val2 = extractvalue <{i32, i64}> %arg1, 0
698 ; %val3 = extractvalue <{i32, i64}> %arg1, 1
699 ; store volatile i32 %val0, i32 addrspace(1)* null
700 ; store volatile i64 %val1, i64 addrspace(1)* null
701 ; store volatile i32 %val2, i32 addrspace(1)* null
702 ; store volatile i64 %val3, i64 addrspace(1)* null
703 ; HSA-VI-LABEL: name: packed_struct_argument_alignment
704 ; HSA-VI: bb.1 (%ir-block.1):
705 ; HSA-VI: liveins: $sgpr4_sgpr5
706 ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
707 ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
708 ; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
709 ; HSA-VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 12 from `<{ i32, i64 }> addrspace(4)* undef`, align 16, addrspace 4)
710 ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
711 ; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
712 ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 4, addrspace 4)
713 ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 13
714 ; HSA-VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64)
715 ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s96) = G_LOAD [[GEP2]](p4) :: (non-temporal invariant load 12 from `<{ i32, i64 }> addrspace(4)* undef`, align 1, addrspace 4)
716 ; HSA-VI: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s96), 0
717 ; HSA-VI: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s96), 32
718 ; HSA-VI: [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s96), 0
719 ; HSA-VI: [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s96), 32
720 ; HSA-VI: S_ENDPGM
721 ret void
722 }
1313
1414 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
1515 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
16
17
1816 define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
19 entry:
20 %0 = zext i8 %in to i32
21 store i32 %0, i32 addrspace(1)* %out, align 4
17 %ext = zext i8 %in to i32
18 store i32 %ext, i32 addrspace(1)* %out, align 4
2219 ret void
2320 }
2421
3229 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
3330 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
3431 define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
35 entry:
36 %0 = zext i8 %in to i32
37 store i32 %0, i32 addrspace(1)* %out, align 4
32 %ext = zext i8 %in to i32
33 store i32 %ext, i32 addrspace(1)* %out, align 4
3834 ret void
3935 }
4036
5046 ; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
5147 ; HSA-VI: flat_store_dword
5248 define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
53 entry:
54 %0 = sext i8 %in to i32
55 store i32 %0, i32 addrspace(1)* %out, align 4
49 %ext = sext i8 %in to i32
50 store i32 %ext, i32 addrspace(1)* %out, align 4
5651 ret void
5752 }
5853
7065 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
7166 ; HSA-VI: flat_store_dword
7267 define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
73 entry:
74 %0 = zext i16 %in to i32
75 store i32 %0, i32 addrspace(1)* %out, align 4
68 %ext = zext i16 %in to i32
69 store i32 %ext, i32 addrspace(1)* %out, align 4
7670 ret void
7771 }
7872
8882 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
8983 ; HSA-VI: flat_store_dword
9084 define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
91 entry:
92 %0 = zext i16 %in to i32
93 store i32 %0, i32 addrspace(1)* %out, align 4
85 %ext = zext i16 %in to i32
86 store i32 %ext, i32 addrspace(1)* %out, align 4
9487 ret void
9588 }
9689
107100 ; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
108101 ; HSA-VI: flat_store_dword
109102 define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
110 entry:
111 %0 = sext i16 %in to i32
112 store i32 %0, i32 addrspace(1)* %out, align 4
103 %ext = sext i16 %in to i32
104 store i32 %ext, i32 addrspace(1)* %out, align 4
113105 ret void
114106 }
115107
656648 store i64 %ext, i64 addrspace(1)* %out, align 8
657649 ret void
658650 }
651
652 ; FUNC-LABEL: {{^}}empty_struct_arg:
653 ; HSA: kernarg_segment_byte_size = 0
654 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
655 ret void
656 }
657
658 ; The correct load offsets for these:
659 ; load 4 from 0,
660 ; load 8 from 8
661 ; load 4 from 24
662 ; load 8 from 32
663
664 ; With the SelectionDAG argument lowering, the alignments for the
665 ; struct members are not properly considered, making these wrong.
666
667 ; FIXME: Total argument size is computed wrong
668 ; FUNC-LABEL: {{^}}struct_argument_alignment:
669 ; HSA: kernarg_segment_byte_size = 40
670 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
671 ; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
672 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
673 ; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
674 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
675 %val0 = extractvalue {i32, i64} %arg0, 0
676 %val1 = extractvalue {i32, i64} %arg0, 1
677 %val2 = extractvalue {i32, i64} %arg1, 0
678 %val3 = extractvalue {i32, i64} %arg1, 1
679 store volatile i32 %val0, i32 addrspace(1)* null
680 store volatile i64 %val1, i64 addrspace(1)* null
681 store volatile i32 %val2, i32 addrspace(1)* null
682 store volatile i64 %val3, i64 addrspace(1)* null
683 ret void
684 }
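; Size sketch for struct_argument_alignment above, assuming the usual
; DataLayout: %arg0 at offset 0, the i8 at 16, %arg1 realigned from 17 up to
; 24, so the explicit arguments end at 24 + 16 = 40 bytes, matching the
; kernarg_segment_byte_size = 40 check.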
685
686 ; No padding between the i8 and the next struct, but rounded up at the end
687 ; to a 4-byte multiple.
688 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
689 ; HSA: kernarg_segment_byte_size = 28
690 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
691 ; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
692 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
693 ; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
694 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
695 %val0 = extractvalue <{i32, i64}> %arg0, 0
696 %val1 = extractvalue <{i32, i64}> %arg0, 1
697 %val2 = extractvalue <{i32, i64}> %arg1, 0
698 %val3 = extractvalue <{i32, i64}> %arg1, 1
699 store volatile i32 %val0, i32 addrspace(1)* null
700 store volatile i64 %val1, i64 addrspace(1)* null
701 store volatile i32 %val2, i32 addrspace(1)* null
702 store volatile i64 %val3, i64 addrspace(1)* null
703 ret void
704 }
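; Packed-layout sketch for packed_struct_argument_alignment above, assuming
; the usual DataLayout: <{ i32, i64 }> has alloc size 12 and ABI align 1, so
; %arg0 sits at offset 0, the i8 at 12, and %arg1 at 13, ending at byte 25;
; rounding the segment up to a 4-byte multiple gives the 28 bytes checked
; above.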