llvm.org GIT mirror llvm / 3dcb576
Merging r267916: ------------------------------------------------------------------------ r267916 | Matthew.Arsenault | 2016-04-28 11:38:48 -0700 (Thu, 28 Apr 2016) | 6 lines AMDGPU: Fix mishandling array allocations when promoting alloca The canonical form for allocas is a single allocation of the array type. In case we see a non-canonical array alloca, make sure we aren't replacing this with an array N times smaller. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@271768 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 3 years ago
4 changed file(s) with 73 addition(s) and 19 deletion(s). Raw diff Collapse all Expand all
330330 }
331331
332332 void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
333 if (!I.isStaticAlloca())
333 // Array allocations are probably not worth handling, since an allocation of
334 // the array type is the canonical form.
335 if (!I.isStaticAlloca() || I.isArrayAllocation())
334336 return;
335337
336338 IRBuilder<> Builder(&I);
1414 ; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
1515 ; with the appropriate offset. We should fold this into the store.
1616 ; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}}
17 ; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}]
17 ; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
18 ; SI-ALLOCA: s_barrier
19 ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
1820 ;
1921 ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
2022 ; alloca to a vector. It currently fails because it does not know how
2123 ; to interpret:
22 ; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
24 ; getelementptr [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b
2325
24 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16
26 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
2527 ; SI-PROMOTE: ds_write_b32 [[PTRREG]]
26 define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
27 %alloca = alloca [4 x i32], i32 4, align 16
28 define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
29 %alloca = alloca [16 x i32], align 16
2830 %tid = call i32 @llvm.SI.tid() readnone
29 %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid
30 %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
31 %a_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inA, i32 %tid
32 %b_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inB, i32 %tid
3133 %a = load i32, i32 addrspace(1)* %a_ptr
3234 %b = load i32, i32 addrspace(1)* %b_ptr
3335 %result = add i32 %a, %b
34 %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
36 %alloca_ptr = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b
3537 store i32 %result, i32* %alloca_ptr, align 4
3638 ; Dummy call
3739 call void @llvm.AMDGPU.barrier.local() nounwind convergent
0 ; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
22 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
33 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
44
1414 ; SI-PROMOTE: ds_read_b64
1515 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
1616 %val = load double, double addrspace(1)* %in, align 8
17 %array = alloca double, i32 16, align 8
18 %ptr = getelementptr double, double* %array, i32 %b
17 %array = alloca [16 x double], align 8
18 %ptr = getelementptr inbounds [16 x double], [16 x double]* %array, i32 0, i32 %b
1919 store double %val, double* %ptr, align 8
2020 call void @llvm.AMDGPU.barrier.local() convergent nounwind
2121 %result = load double, double* %ptr, align 8
3434 ; SI-PROMOTE: ds_read_b64
3535 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
3636 %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
37 %array = alloca <2 x double>, i32 16, align 16
38 %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
37 %array = alloca [8 x <2 x double>], align 16
38 %ptr = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %array, i32 0, i32 %b
3939 store <2 x double> %val, <2 x double>* %ptr, align 16
4040 call void @llvm.AMDGPU.barrier.local() convergent nounwind
4141 %result = load <2 x double>, <2 x double>* %ptr, align 16
5252 ; SI-PROMOTE: ds_read_b64
5353 define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
5454 %val = load i64, i64 addrspace(1)* %in, align 8
55 %array = alloca i64, i32 16, align 8
56 %ptr = getelementptr i64, i64* %array, i32 %b
55 %array = alloca [8 x i64], align 8
56 %ptr = getelementptr inbounds [8 x i64], [8 x i64]* %array, i32 0, i32 %b
5757 store i64 %val, i64* %ptr, align 8
5858 call void @llvm.AMDGPU.barrier.local() convergent nounwind
5959 %result = load i64, i64* %ptr, align 8
7272 ; SI-PROMOTE: ds_read_b64
7373 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
7474 %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
75 %array = alloca <2 x i64>, i32 16, align 16
76 %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b
75 %array = alloca [8 x <2 x i64>], align 16
76 %ptr = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %array, i32 0, i32 %b
7777 store <2 x i64> %val, <2 x i64>* %ptr, align 16
7878 call void @llvm.AMDGPU.barrier.local() convergent nounwind
7979 %result = load <2 x i64>, <2 x i64>* %ptr, align 16
0 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s
1
2 ; Make sure this allocates the correct size if the alloca has a non-0
3 ; number of elements.
4
5 ; CHECK-LABEL: @array_alloca(
6 ; CHECK: %stack = alloca i32, i32 5, align 4
7 define void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
8 entry:
9 %stack = alloca i32, i32 5, align 4
10 %ld0 = load i32, i32 addrspace(1)* %in, align 4
11 %arrayidx1 = getelementptr inbounds i32, i32* %stack, i32 %ld0
12 store i32 4, i32* %arrayidx1, align 4
13 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
14 %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
15 %arrayidx3 = getelementptr inbounds i32, i32* %stack, i32 %ld1
16 store i32 5, i32* %arrayidx3, align 4
17 %arrayidx10 = getelementptr inbounds i32, i32* %stack, i32 0
18 %ld2 = load i32, i32* %arrayidx10, align 4
19 store i32 %ld2, i32 addrspace(1)* %out, align 4
20 %arrayidx12 = getelementptr inbounds i32, i32* %stack, i32 1
21 %ld3 = load i32, i32* %arrayidx12
22 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
23 store i32 %ld3, i32 addrspace(1)* %arrayidx13
24 ret void
25 }
26
27 ; CHECK-LABEL: @array_alloca_dynamic(
28 ; CHECK: %stack = alloca i32, i32 %size, align 4
29 define void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
30 entry:
31 %stack = alloca i32, i32 %size, align 4
32 %ld0 = load i32, i32 addrspace(1)* %in, align 4
33 %arrayidx1 = getelementptr inbounds i32, i32* %stack, i32 %ld0
34 store i32 4, i32* %arrayidx1, align 4
35 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
36 %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
37 %arrayidx3 = getelementptr inbounds i32, i32* %stack, i32 %ld1
38 store i32 5, i32* %arrayidx3, align 4
39 %arrayidx10 = getelementptr inbounds i32, i32* %stack, i32 0
40 %ld2 = load i32, i32* %arrayidx10, align 4
41 store i32 %ld2, i32 addrspace(1)* %out, align 4
42 %arrayidx12 = getelementptr inbounds i32, i32* %stack, i32 1
43 %ld3 = load i32, i32* %arrayidx12
44 %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
45 store i32 %ld3, i32 addrspace(1)* %arrayidx13
46 ret void
47 }
48
49 attributes #0 = { nounwind }