llvm.org GIT mirror llvm / 2006e62
[AMDGPU] Supported ds_read_b128 generation; Widened vector length for local address-space. Summary: Starting from GCN 2nd generation, the ISA supports ds_read_b128 in addition to ds_read_b64. This patch adds the ds_read_b128 instruction pattern and the generation of this instruction. In the vectorizer, this patch also widens the vector length so that the vectorizer generates 128-bit loads for the local address space, which get translated to ds_read_b128. Since the performance benefit is not clear, the compiler generates ds_read_b128 only under -amdgpu-ds128. Author: FarhanaAleen Reviewed By: rampitec, arsenm Subscribers: llvm-commits, AMDGPU Differential Revision: https://reviews.llvm.org/D44210 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@327153 91177308-0d34-0410-b5e6-96231b3b80d8 Farhana Aleen 1 year, 11 months ago
14 changed file(s) with 140 addition(s) and 14 deletion(s). Raw diff Collapse all Expand all
247247 return cast(N)->getAlignment() % 8 == 0;
248248 }]>;
249249
250 class Aligned16Bytes : PatFrag
251 return cast(N)->getAlignment() >= 16;
252 }]>;
253
250254 class LoadFrag : PatFrag<(ops node:$ptr), (op node:$ptr)>;
251255
252256 class StoreFrag : PatFrag <
367371 def truncstorei8_local_hi16 : StoreHi16, LocalAddress;
368372
369373 def load_align8_local : Aligned8Bytes <
374 (ops node:$ptr), (load_local node:$ptr)
375 >;
376
377 def load_align16_local : Aligned16Bytes <
370378 (ops node:$ptr), (load_local node:$ptr)
371379 >;
372380
413413 return FlatForGlobal;
414414 }
415415
416 /// \returns If target supports ds_read/write_b128 and user enables generation
417 /// of ds_read/write_b128.
418 bool useDS128(bool UserEnable) const {
419 return CIInsts && UserEnable;
420 }
421
416422 /// \returns If MUBUF instructions always perform range checking, even for
417423 /// buffer resources used for private memory access.
418424 bool privateMemoryResourceIsRangeChecked() const {
264264 return 512;
265265 }
266266
267 if (AddrSpace == AS.FLAT_ADDRESS)
267 if (AddrSpace == AS.FLAT_ADDRESS ||
268 AddrSpace == AS.LOCAL_ADDRESS ||
269 AddrSpace == AS.REGION_ADDRESS)
268270 return 128;
269 if (AddrSpace == AS.LOCAL_ADDRESS ||
270 AddrSpace == AS.REGION_ADDRESS)
271 return 64;
271
272272 if (AddrSpace == AS.PRIVATE_ADDRESS)
273273 return 8 * ST->getMaxPrivateElementSize();
274274
648648 let AddedComplexity = 100 in {
649649
650650 defm : DSReadPat_mc ;
651 defm : DSReadPat_mc ;
651652
652653 } // End AddedComplexity = 100
653654
9393 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
9494 cl::init(false));
9595
96 static cl::opt EnableDS128(
97 "amdgpu-ds128",
98 cl::desc("Use DS_read/write_b128"),
99 cl::init(false));
100
96101 static cl::opt AssumeFrameIndexHighZeroBits(
97102 "amdgpu-frame-index-zero-bits",
98103 cl::desc("High bits of frame index assumed to be zero"),
54245429 llvm_unreachable("unsupported private_element_size");
54255430 }
54265431 } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
5432 // Use ds_read_b128 if possible.
5433 if (Subtarget->useDS128(EnableDS128) && Load->getAlignment() >= 16 &&
5434 MemVT.getStoreSize() == 16)
5435 return SDValue();
5436
54275437 if (NumElements > 2)
54285438 return SplitVectorLoad(Op, DAG);
5429
5430 if (NumElements == 2)
5431 return SDValue();
5432
5433 // If properly aligned, if we split we might be able to use ds_read_b64.
5434 return SplitVectorLoad(Op, DAG);
54355439 }
54365440 return SDValue();
54375441 }
409409 def load_glue_align8 : Aligned8Bytes <
410410 (ops node:$ptr), (load_glue node:$ptr)
411411 >;
412 def load_glue_align16 : Aligned16Bytes <
413 (ops node:$ptr), (load_glue node:$ptr)
414 >;
412415
413416
414417 def load_local_m0 : LoadFrag, LocalAddress;
417420 def az_extloadi8_local_m0 : LoadFrag, LocalAddress;
418421 def az_extloadi16_local_m0 : LoadFrag, LocalAddress;
419422 def load_align8_local_m0 : LoadFrag , LocalAddress;
423 def load_align16_local_m0 : LoadFrag , LocalAddress;
420424
421425
422426 def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore,
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
11 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
22 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
3
4 ; Testing for ds_read_b128
5 ; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-ds128 < %s | FileCheck -check-prefixes=SI,FUNC %s
6 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
38
49 ; FUNC-LABEL: {{^}}load_f32_local:
510 ; SICIVI: s_mov_b32 m0
121126 ret void
122127 }
123128
129 ; Tests if ds_read_b128 gets generated for the 16 byte aligned load.
130 ; FUNC-LABEL: {{^}}local_v4f32_to_128:
131 ; SI-NOT: ds_read_b128
132 ; CIVI: ds_read_b128
133 ; EG: LDS_READ_RET
134 ; EG: LDS_READ_RET
135 ; EG: LDS_READ_RET
136 ; EG: LDS_READ_RET
137 define amdgpu_kernel void @local_v4f32_to_128(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) {
138 %ld = load <4 x float>, <4 x float> addrspace(3)* %in, align 16
139 store <4 x float> %ld, <4 x float> addrspace(3)* %out
140 ret void
141 }
142
124143 attributes #0 = { nounwind }
22 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
33 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
44 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
5
6 ; Testing for ds_read_b128
7 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
59
610 ; FUNC-LABEL: {{^}}local_load_f64:
711 ; SICIV: s_mov_b32 m0
169173 ret void
170174 }
171175
176 ; Tests if ds_read_b128 gets generated for the 16 byte aligned load.
177 ; FUNC-LABEL: {{^}}local_load_v2f64_to_128:
178 ; CIVI: ds_read_b128
179 ; EG: LDS_READ_RET
180 ; EG: LDS_READ_RET
181 ; EG: LDS_READ_RET
182 ; EG: LDS_READ_RET
183 define amdgpu_kernel void @local_load_v2f64_to_128(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) {
184 entry:
185 %ld = load <2 x double>, <2 x double> addrspace(3)* %in, align 16
186 store <2 x double> %ld, <2 x double> addrspace(3)* %out
187 ret void
188 }
189
172190 attributes #0 = { nounwind }
11 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
22 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89,FUNC %s
33 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4
5 ; Testing for ds_read_b128
6 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
48
59 ; FUNC-LABEL: {{^}}local_load_i16:
610 ; GFX9-NOT: m0
934938 ; ret void
935939 ; }
936940
941 ; Tests if ds_read_b128 gets generated for the 16 byte aligned load.
942 ; FUNC-LABEL: {{^}}local_v8i16_to_128:
943 ; SI-NOT: ds_read_b128
944 ; CIVI: ds_read_b128
945 ; EG: LDS_READ_RET
946 ; EG: LDS_READ_RET
947 ; EG: LDS_READ_RET
948 ; EG: LDS_READ_RET
949 define amdgpu_kernel void @local_v8i16_to_128(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
950 %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in, align 16
951 store <8 x i16> %ld, <8 x i16> addrspace(3)* %out
952 ret void
953 }
954
937955 attributes #0 = { nounwind }
11 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
22 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
33 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4
5 ; Testing for ds_read_b128
6 ; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-ds128 < %s | FileCheck -check-prefixes=SI,FUNC %s
7 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
49
510 ; FUNC-LABEL: {{^}}local_load_i32:
611 ; GCN-NOT: s_wqm_b64
174179 ret void
175180 }
176181
182 ; Tests if ds_read_b128 gets generated for the 16 byte aligned load.
183 ; FUNC-LABEL: {{^}}local_v4i32_to_128:
184 ; SI-NOT: ds_read_b128
185 ; CIVI: ds_read_b128
186 ; EG: LDS_READ_RET
187 ; EG: LDS_READ_RET
188 ; EG: LDS_READ_RET
189 ; EG: LDS_READ_RET
190 define amdgpu_kernel void @local_v4i32_to_128(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) {
191 %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
192 store <4 x i32> %ld, <4 x i32> addrspace(3)* %out
193 ret void
194 }
195
177196 ; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
178197 ; SICIVI: s_mov_b32 m0, -1
179198 ; GFX9-NOT: m0
22 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
33 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
44 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
5
6 ; Testing for ds_read_b128
7 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
59
610 ; FUNC-LABEL: {{^}}local_load_i64:
711 ; SICIVI: s_mov_b32 m0
2933 ; EG: LDS_READ_RET
3034 ; EG: LDS_READ_RET
3135 define amdgpu_kernel void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 {
36 entry:
37 %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in
38 store <2 x i64> %ld, <2 x i64> addrspace(3)* %out
39 ret void
40 }
41
42 ; Tests if ds_read_b128 gets generated for the 16 byte aligned load.
43 ; FUNC-LABEL: {{^}}local_load_v2i64_to_128:
44 ; CIVI: ds_read_b128
45 define amdgpu_kernel void @local_load_v2i64_to_128(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) {
3246 entry:
3347 %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in
3448 store <2 x i64> %ld, <2 x i64> addrspace(3)* %out
22 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
33 ; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
44
5 ; Testing for ds_read_b128
6 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-ds128 < %s | FileCheck -check-prefixes=CIVI,FUNC %s
58
69 ; FUNC-LABEL: {{^}}local_load_i8:
710 ; GCN-NOT: s_wqm_b64
10201023 ; ret void
10211024 ; }
10221025
1026 ; Tests if ds_read_b128 gets generated for the 16 byte aligned load.
1027 ; FUNC-LABEL: {{^}}local_v16i8_to_128:
1028 ; SI-NOT: ds_read_b128
1029 ; CIVI: ds_read_b128
1030 ; EG: LDS_READ_RET
1031 ; EG: LDS_READ_RET
1032 ; EG: LDS_READ_RET
1033 ; EG: LDS_READ_RET
1034 define amdgpu_kernel void @local_v16i8_to_128(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) {
1035 %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 16
1036 store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
1037 ret void
1038 }
1039
10231040 attributes #0 = { nounwind }
503503 }
504504
505505 ; CHECK-LABEL: @merge_local_store_4_constants_i32
506 ; CHECK: store <2 x i32> , <2 x i32> addrspace(3)*
507 ; CHECK: store <2 x i32> , <2 x i32> addrspace(3)*
506 ; CHECK: store <4 x i32> , <4 x i32> addrspace(3)*
508507 define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
509508 %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
510509 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
2828 ; longest chain vectorized
2929
3030 ; CHECK-LABEL: @interleave_get_longest
31 ; CHECK: load <2 x i32>
31 ; CHECK: load <4 x i32>
3232 ; CHECK: load i32
3333 ; CHECK: store <2 x i32> zeroinitializer
3434 ; CHECK: load i32
35 ; CHECK: load <2 x i32>
3635 ; CHECK: load i32
3736 ; CHECK: load i32
3837