llvm.org GIT mirror llvm / 626ceb2
AMDGPU: Prepare for reducing private element size. Tests for the new scalarize-all-private-access options will be included in a future commit. The only functional change is to make the split/scalarize behavior for private accesses of vectors with more than 4 elements consistent with the flat/global handling. This makes the spilling worse in the two changed tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@260804 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 4 years ago
4 changed file(s) with 180 addition(s) and 17 deletion(s). Raw diff Collapse all Expand all
16921692 //
16931693 // Fall-through
16941694 case AMDGPUAS::GLOBAL_ADDRESS:
1695 case AMDGPUAS::PRIVATE_ADDRESS:
1696 if (NumElements >= 8)
1695 case AMDGPUAS::FLAT_ADDRESS:
1696 if (NumElements > 4)
16971697 return SplitVectorLoad(Op, DAG);
16981698 // v4 loads are supported for private and global memory.
16991699 return SDValue();
1700 case AMDGPUAS::PRIVATE_ADDRESS: {
1701 // Depending on the setting of the private_element_size field in the
1702 // resource descriptor, we can only make private accesses up to a certain
1703 // size.
1704 switch (Subtarget->getMaxPrivateElementSize()) {
1705 case 4:
1706 return ScalarizeVectorLoad(Op, DAG);
1707 case 8:
1708 if (NumElements > 2)
1709 return SplitVectorLoad(Op, DAG);
1710 return SDValue();
1711 case 16:
1712 // Same as global/flat
1713 if (NumElements > 4)
1714 return SplitVectorLoad(Op, DAG);
1715 return SDValue();
1716 default:
1717 llvm_unreachable("unsupported private_element_size");
1718 }
1719 }
17001720 case AMDGPUAS::LOCAL_ADDRESS:
17011721 // If properly aligned, if we split we might be able to use ds_read_b64.
17021722 return SplitVectorLoad(Op, DAG);
19061926
19071927 assert(Store->getValue().getValueType().getScalarType() == MVT::i32);
19081928
1909 unsigned NElts = VT.getVectorNumElements();
1910 unsigned AS = Store->getAddressSpace();
1911 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
1929 unsigned NumElements = VT.getVectorNumElements();
1930 switch (Store->getAddressSpace()) {
1931 case AMDGPUAS::GLOBAL_ADDRESS:
1932 case AMDGPUAS::FLAT_ADDRESS:
1933 if (NumElements > 4)
1934 return SplitVectorStore(Op, DAG);
1935 return SDValue();
1936 case AMDGPUAS::PRIVATE_ADDRESS: {
1937 switch (Subtarget->getMaxPrivateElementSize()) {
1938 case 4:
1939 return ScalarizeVectorStore(Op, DAG);
1940 case 8:
1941 if (NumElements > 2)
1942 return SplitVectorStore(Op, DAG);
1943 return SDValue();
1944 case 16:
1945 if (NumElements > 4)
1946 return SplitVectorStore(Op, DAG);
1947 return SDValue();
1948 default:
1949 llvm_unreachable("unsupported private_element_size");
1950 }
1951 }
1952 case AMDGPUAS::LOCAL_ADDRESS:
19121953 // If properly aligned, if we split we might be able to use ds_write_b64.
19131954 return SplitVectorStore(Op, DAG);
1914 }
1915
1916 if (AS == AMDGPUAS::PRIVATE_ADDRESS && NElts > 4)
1917 return ScalarizeVectorStore(Op, DAG);
1918
1919 // These stores are legal. private, global and flat.
1920 if (NElts >= 8)
1921 return SplitVectorStore(Op, DAG);
1922
1923 return SDValue();
1955 default:
1956 llvm_unreachable("unhandled address space");
1957 }
19241958 }
19251959
19261960 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
0 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT16 -check-prefix=HSA -check-prefix=HSA-ELT16 -check-prefix=ALL %s
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-8 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT8 -check-prefix=HSA -check-prefix=HSA-ELT8 -check-prefix=ALL %s
2 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT4 -check-prefix=HSA -check-prefix=HSA-ELT4 -check-prefix=ALL %s
3
4
5 ; ALL-LABEL: {{^}}private_elt_size_v4i32:
6
7 ; HSA-ELT16: private_element_size = 3
8 ; HSA-ELT8: private_element_size = 2
9 ; HSA-ELT4: private_element_size = 1
10
11
12 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
13 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
14 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
15
16 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
17 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
18 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
19 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24
20
21 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
22 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
23
24
25 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
26 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
27 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
28 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
29 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
30 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
31 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
32 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
33
34 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
35 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
36 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
37 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
38 define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
39 entry:
40 %tid = call i32 @llvm.amdgcn.workitem.id.x()
41 %idxprom = sext i32 %tid to i64
42 %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
43 %index.load = load i32, i32 addrspace(1)* %gep.index
44 %index = and i32 %index.load, 2
45 %alloca = alloca [2 x <4 x i32>], align 16
46 %gep0 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 0
47 %gep1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 1
48 store <4 x i32> zeroinitializer, <4 x i32>* %gep0
49 store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* %gep1
50 %idxprom2 = sext i32 %index to i64
51 %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i64 0, i64 %idxprom2
52 %load = load <4 x i32>, <4 x i32>* %gep2
53 store <4 x i32> %load, <4 x i32> addrspace(1)* %out
54 ret void
55 }
56
57 ; ALL-LABEL: {{^}}private_elt_size_v8i32:
58 ; HSA-ELT16: private_element_size = 3
59 ; HSA-ELT8: private_element_size = 2
60 ; HSA-ELT4: private_element_size = 1
61
62 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
63 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
64 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32
65 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48
66
67 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
68 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
69
70
71 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
72 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8
73 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16
74 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24
75 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32
76 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:40
77 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48
78 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56
79
80 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
81 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
82
83
84 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
85 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
86 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
87 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
88 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
89 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
90 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
91 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
92 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:32{{$}}
93 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:36{{$}}
94 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:40{{$}}
95 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:44{{$}}
96 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:48{{$}}
97 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:52{{$}}
98 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}}
99 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:60{{$}}
100
101 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
102 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
103 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
104 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
105 define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
106 entry:
107 %tid = call i32 @llvm.amdgcn.workitem.id.x()
108 %idxprom = sext i32 %tid to i64
109 %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
110 %index.load = load i32, i32 addrspace(1)* %gep.index
111 %index = and i32 %index.load, 2
112 %alloca = alloca [2 x <8 x i32>], align 16
113 %gep0 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 0
114 %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 1
115 store <8 x i32> zeroinitializer, <8 x i32>* %gep0
116 store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %gep1
117 %idxprom2 = sext i32 %index to i64
118 %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i64 0, i64 %idxprom2
119 %load = load <8 x i32>, <8 x i32>* %gep2
120 store <8 x i32> %load, <8 x i32> addrspace(1)* %out
121 ret void
122 }
123
124 declare i32 @llvm.amdgcn.workitem.id.x() #1
125
126 attributes #0 = { nounwind }
127 attributes #1 = { nounwind readnone }
2828
2929 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
3030
31 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
31 ; GCN: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
3232 ; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
3333
3434 ; GCN: NumVgprs: 256
1717 ; VI-NEXT: s_mov_b32 s15, 0x980000
1818
1919 ; s12 is offset user SGPR
20 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
20 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
21 ; GCN: buffer_load_dword v{{[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload
2122
2223 ; GCN: NumVgprs: 256
2324 ; GCN: ScratchSize: 1024