llvm.org GIT mirror llvm / 0db0961
[AMDGPU] Extend promote alloca vectorization Promote alloca can vectorize a small array by bitcasting it to a vector type. Extend vectorization for the case when alloca is already a vector type. We still want to replace GEPs with an insert/extract element instructions in this case. Differential Revision: https://reviews.llvm.org/D54219 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346376 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin 10 months ago
2 changed file(s) with 209 addition(s) and 4 deletion(s). Raw diff Collapse all Expand all
327327 // Currently only handle the case where the Pointer Operand is a GEP.
328328 // Also we could not vectorize volatile or atomic loads.
329329 LoadInst *LI = cast<LoadInst>(Inst);
330 if (isa<AllocaInst>(User) &&
331 LI->getPointerOperandType() == User->getType() &&
332 isa<VectorType>(LI->getType()))
333 return true;
330334 return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
331335 }
332336 case Instruction::BitCast:
336340 // since it should be canonical form, the User should be a GEP.
337341 // Also we could not vectorize volatile or atomic stores.
338342 StoreInst *SI = cast<StoreInst>(Inst);
343 if (isa<AllocaInst>(User) &&
344 SI->getPointerOperandType() == User->getType() &&
345 isa<VectorType>(SI->getValueOperand()->getType()))
346 return true;
339347 return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
340348 }
341349 default:
350358 return false;
351359 }
352360
353 ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
361 Type *AT = Alloca->getAllocatedType();
362 SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
354363
355364 LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
356365
397406 }
398407 }
399408
400 VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
409 VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
410 if (!VectorTy)
411 VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
401412
402413 LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
403414 << *VectorTy << '\n');
407418 IRBuilder<> Builder(Inst);
408419 switch (Inst->getOpcode()) {
409420 case Instruction::Load: {
421 if (Inst->getType() == AT)
422 break;
423
410424 Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
411425 Value *Ptr = cast(Inst)->getPointerOperand();
412426 Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
419433 break;
420434 }
421435 case Instruction::Store: {
436 StoreInst *SI = cast<StoreInst>(Inst);
437 if (SI->getValueOperand()->getType() == AT)
438 break;
439
422440 Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
423
424 StoreInst *SI = cast<StoreInst>(Inst);
425441 Value *Ptr = SI->getPointerOperand();
426442 Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
427443 Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
0 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
2 ; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
3
4 ; GCN-LABEL: {{^}}float4_alloca_store4:
5 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
6
7 ; GCN-NOT: buffer_
8 ; GCN: v_readfirstlane_b32
9 ; GFX8: v_movrels_b32
10 ; GFX9: s_set_gpr_idx_on
11 ; GFX9: s_set_gpr_idx_off
12
13 ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
14 ; OPT: store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
15 ; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
16 ; OPT: %1 = extractelement <4 x float> %0, i32 %sel2
17 ; OPT: store float %1, float addrspace(1)* %out, align 4
18
19 define amdgpu_kernel void @float4_alloca_store4(float addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
20 entry:
; Vector-typed private (addrspace 5) alloca: the promote-alloca candidate.
; The dynamically indexed scalar load through %gep below should be rewritten
; into a full-vector load + extractelement (see the OPT checks above).
21 %alloca = alloca <4 x float>, align 16, addrspace(5)
22 %x = tail call i32 @llvm.amdgcn.workitem.id.x()
23 %y = tail call i32 @llvm.amdgcn.workitem.id.y()
24 %c1 = icmp uge i32 %x, 3
25 %c2 = icmp uge i32 %y, 3
; %sel2 is a non-constant lane index (0, 1, or 2), forcing dynamic indexing.
26 %sel1 = select i1 %c1, i32 1, i32 2
27 %sel2 = select i1 %c2, i32 0, i32 %sel1
28 %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
; Restored vector constant (stripped by page extraction; store had no value
; operand, which is invalid IR).
29 store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> addrspace(5)* %alloca, align 4
30 %load = load float, float addrspace(5)* %gep, align 4
31 store float %load, float addrspace(1)* %out, align 4
32 ret void
33 }
34
35 ; GCN-LABEL: {{^}}float4_alloca_load4:
36 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_load4
37
38 ; GCN-NOT: buffer_
39 ; GCN: v_readfirstlane_b32
40 ; GFX8: v_movreld_b32
41 ; GFX9: s_set_gpr_idx_on
42 ; GFX9: s_set_gpr_idx_off
43
44 ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
45 ; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
46 ; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
47 ; OPT: store <4 x float> %1, <4 x float> addrspace(5)* %alloca
48 ; OPT: %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
49 ; OPT: store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
50
51 define amdgpu_kernel void @float4_alloca_load4(<4 x float> addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
52 entry:
; Vector-typed private (addrspace 5) alloca: the promote-alloca candidate.
; Here the dynamically indexed access is a scalar STORE through %gep; the
; OPT checks above expect it to become load-vector + insertelement + store.
53 %alloca = alloca <4 x float>, align 16, addrspace(5)
54 %x = tail call i32 @llvm.amdgcn.workitem.id.x()
55 %y = tail call i32 @llvm.amdgcn.workitem.id.y()
56 %c1 = icmp uge i32 %x, 3
57 %c2 = icmp uge i32 %y, 3
; %sel2 is a non-constant lane index (0, 1, or 2), forcing dynamic indexing.
58 %sel1 = select i1 %c1, i32 1, i32 2
59 %sel2 = select i1 %c2, i32 0, i32 %sel1
60 %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
61 store float 1.0, float addrspace(5)* %gep, align 4
; Whole-vector load of the alloca exercises the new "user is the alloca
; itself" path added by this commit.
62 %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
63 store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
64 ret void
65 }
66
67 ; GCN-LABEL: {{^}}half4_alloca_store4:
68 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_store4
69
70 ; GCN-NOT: buffer_
71 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
72 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
73 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
74
75 ; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
76 ; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, <4 x half> addrspace(5)* %alloca, align 2
77 ; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
78 ; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
79 ; OPT: store half %1, half addrspace(1)* %out, align 2
80
81 define amdgpu_kernel void @half4_alloca_store4(half addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
82 entry:
; Vector-typed private (addrspace 5) alloca of sub-dword elements; the
; dynamically indexed scalar load should become extractelement (OPT checks).
83 %alloca = alloca <4 x half>, align 16, addrspace(5)
84 %x = tail call i32 @llvm.amdgcn.workitem.id.x()
85 %y = tail call i32 @llvm.amdgcn.workitem.id.y()
86 %c1 = icmp uge i32 %x, 3
87 %c2 = icmp uge i32 %y, 3
; %sel2 is a non-constant lane index (0, 1, or 2), forcing dynamic indexing.
88 %sel1 = select i1 %c1, i32 1, i32 2
89 %sel2 = select i1 %c2, i32 0, i32 %sel1
90 %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
; Restored vector constant (stripped by page extraction). Lanes match the
; GCN checks: 0x40003c00 = <2.0, 1.0>, 0x44004200 = <4.0, 3.0>.
91 store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, <4 x half> addrspace(5)* %alloca, align 2
92 %load = load half, half addrspace(5)* %gep, align 2
93 store half %load, half addrspace(1)* %out, align 2
94 ret void
95 }
96
97 ; GCN-LABEL: {{^}}half4_alloca_load4:
98 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4
99
100 ; GCN-NOT: buffer_
101 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
102 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
103
104 ; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
105 ; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
106 ; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
107 ; OPT: store <4 x half> %1, <4 x half> addrspace(5)* %alloca
108 ; OPT: %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
109 ; OPT: store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
110
111 define amdgpu_kernel void @half4_alloca_load4(<4 x half> addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
112 entry:
; Vector-typed private (addrspace 5) alloca of sub-dword elements; the
; dynamically indexed scalar store should become insertelement (OPT checks).
113 %alloca = alloca <4 x half>, align 16, addrspace(5)
114 %x = tail call i32 @llvm.amdgcn.workitem.id.x()
115 %y = tail call i32 @llvm.amdgcn.workitem.id.y()
116 %c1 = icmp uge i32 %x, 3
117 %c2 = icmp uge i32 %y, 3
; %sel2 is a non-constant lane index (0, 1, or 2), forcing dynamic indexing.
118 %sel1 = select i1 %c1, i32 1, i32 2
119 %sel2 = select i1 %c2, i32 0, i32 %sel1
120 %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
; NOTE(review): align 4 on a half store looks odd next to the align-2
; accesses below -- presumably intentional over-alignment; confirm upstream.
121 store half 1.0, half addrspace(5)* %gep, align 4
122 %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
123 store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
124 ret void
125 }
126
127 ; GCN-LABEL: {{^}}short4_alloca_store4:
128 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_store4
129
130 ; GCN-NOT: buffer_
131 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003
132 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
133 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
134
135 ; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
136 ; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
137 ; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
138 ; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
139 ; OPT: store i16 %1, i16 addrspace(1)* %out, align 2
140
141 define amdgpu_kernel void @short4_alloca_store4(i16 addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
142 entry:
; Vector-typed private (addrspace 5) alloca of i16 elements; the dynamically
; indexed scalar load should become extractelement (OPT checks above).
143 %alloca = alloca <4 x i16>, align 16, addrspace(5)
144 %x = tail call i32 @llvm.amdgcn.workitem.id.x()
145 %y = tail call i32 @llvm.amdgcn.workitem.id.y()
146 %c1 = icmp uge i32 %x, 3
147 %c2 = icmp uge i32 %y, 3
; %sel2 is a non-constant lane index (0, 1, or 2), forcing dynamic indexing.
148 %sel1 = select i1 %c1, i32 1, i32 2
149 %sel2 = select i1 %c2, i32 0, i32 %sel1
150 %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
; Restored vector constant (stripped by page extraction). Lanes match the
; GCN checks: 0x20001 = <2, 1>, 0x40003 = <4, 3>.
151 store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
152 %load = load i16, i16 addrspace(5)* %gep, align 2
153 store i16 %load, i16 addrspace(1)* %out, align 2
154 ret void
155 }
156
157 ; GCN-LABEL: {{^}}short4_alloca_load4:
158 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4
159
160 ; GCN-NOT: buffer_
161 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
162 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
163
164 ; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
165 ; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
166 ; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
167 ; OPT: store <4 x i16> %1, <4 x i16> addrspace(5)* %alloca
168 ; OPT: %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
169 ; OPT: store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
170
171 define amdgpu_kernel void @short4_alloca_load4(<4 x i16> addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
172 entry:
; Vector-typed private (addrspace 5) alloca of i16 elements; the dynamically
; indexed scalar store should become insertelement (OPT checks above).
173 %alloca = alloca <4 x i16>, align 16, addrspace(5)
174 %x = tail call i32 @llvm.amdgcn.workitem.id.x()
175 %y = tail call i32 @llvm.amdgcn.workitem.id.y()
176 %c1 = icmp uge i32 %x, 3
177 %c2 = icmp uge i32 %y, 3
; %sel2 is a non-constant lane index (0, 1, or 2), forcing dynamic indexing.
178 %sel1 = select i1 %c1, i32 1, i32 2
179 %sel2 = select i1 %c2, i32 0, i32 %sel1
180 %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
; NOTE(review): align 4 on an i16 store looks odd next to the align-2
; accesses below -- presumably intentional over-alignment; confirm upstream.
181 store i16 1, i16 addrspace(5)* %gep, align 4
; Whole-vector load of the alloca exercises the new "user is the alloca
; itself" path added by this commit.
182 %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
183 store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
184 ret void
185 }
186
187 declare i32 @llvm.amdgcn.workitem.id.x()
188 declare i32 @llvm.amdgcn.workitem.id.y()