llvm.org GIT mirror llvm / 47b8de0
[AMDGPU] gfx10 tests. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363946 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin a month ago
21 changed file(s) with 235367 addition(s) and 191 deletion(s). Raw diff Collapse all Expand all
0 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2
; test_add_lit: each work-item loads an i64 from %p[workitem.id.x], adds the
; constant 994294967295 (high dword 0xe7, low dword 0x80992bff) and stores the
; sum back to the same address.
; The GFX10 checks expect both dwords to appear as literal operands of the
; add / add-with-carry pair; the GFX9 checks expect the high dword to be moved
; into a VGPR before the add-with-carry.
; The label directive below previously used a misspelled prefix that no RUN
; line enables, so FileCheck silently skipped it; it now uses the shared prefix.
3 ; GCN-LABEL: {{^}}test_add_lit:
4 ; GFX10: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, 0x80992bff, v{{[0-9]+}}
5 ; GFX10: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0xe7, v{{[0-9]+}}, vcc_lo
6 ; GFX9: v_mov_b32_e32 [[C2:v[0-9]+]], 0xe7
7 ; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x80992bff, v{{[0-9]+}}
8 ; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, [[C2]], vcc
9 define amdgpu_kernel void @test_add_lit(i64 addrspace(1)* %p) {
10 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
11 %ptr = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %id
12 %load = load i64, i64 addrspace(1)* %ptr, align 8
13 %add = add nsw i64 %load, 994294967295
14 store i64 %add, i64 addrspace(1)* %ptr, align 8
15 ret void
16 }
17
; test_cndmask_lit: each work-item loads %p[id] and %p[id + 1], then stores
; to %p[id] either the constant 12345 (0x3039) when the first value is
; greater than zero, or the second loaded value otherwise.
; The GFX10 checks expect 0x3039 to be a literal operand of v_cndmask_b32;
; the GFX9 checks expect it to be moved into a VGPR first.
; The label directive below previously used a misspelled prefix that no RUN
; line enables, so FileCheck silently skipped it; it now uses the shared prefix.
18 ; GCN-LABEL: {{^}}test_cndmask_lit:
19 ; GFX10: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3039, v{{[0-9]+}}, vcc_lo
20 ; GFX9: v_mov_b32_e32 [[C:v[0-9]+]], 0x3039
21 ; GFX9: v_cndmask_b32_e32 v{{[0-9]+}}, [[C]], v{{[0-9]+}}, vcc
22 define amdgpu_kernel void @test_cndmask_lit(i32 addrspace(1)* %p) {
23 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
24 %n = add nuw nsw i32 %id, 1
25 %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %id
26 %v1 = load i32, i32 addrspace(1)* %p1, align 4
27 %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %n
28 %v2 = load i32, i32 addrspace(1)* %p2, align 4
29 %cmp = icmp sgt i32 %v1, 0
30 %sel = select i1 %cmp, i32 12345, i32 %v2
31 store i32 %sel, i32 addrspace(1)* %p1, align 4
32 ret void
33 }
34
; test_bfe_2lit_s: calls the ubfe intrinsic with two non-inline constants,
; 12345 (0x3039) and 56789 (0xddd5), around a kernel-argument operand %src,
; and stores the result to %p. The checks match %src as a scalar register.
; The GFX10 checks expect 0x3039 as a literal operand of v_bfe_u32 and 0xddd5
; moved into a VGPR; the GFX9 checks expect both constants to be materialized
; into registers (s_movk_i32 and v_mov_b32) before the v_bfe_u32.
; NOTE(review): presumably this exercises the per-instruction literal limit
; on each target - confirm against the ISA documentation.
35 ; GCN-LABEL: {{^}}test_bfe_2lit_s:
36 ; GFX10: v_mov_b32_e32 [[C1:v[0-9]+]], 0xddd5
37 ; GFX10: v_bfe_u32 v{{[0-9]+}}, 0x3039, s{{[0-9]+}}, [[C1]]
38 ; GFX9-DAG: v_mov_b32_e32 [[C2:v[0-9]+]], 0xddd5
39 ; GFX9-DAG: s_movk_i32 [[C1:s[0-9]+]], 0x3039
40 ; GFX9: v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, [[C2]]
41 define amdgpu_kernel void @test_bfe_2lit_s(i32 addrspace(1)* %p, i32 %src) {
42 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 12345, i32 %src, i32 56789)
43 store i32 %bfe, i32 addrspace(1)* %p, align 4
44 ret void
45 }
46
; test_bfe_2lit_v: same ubfe call as the scalar variant, but the middle
; operand is a value loaded per work-item from %p[workitem.id.x]; the result
; is stored back to that address. The checks match that operand as a VGPR.
; The GFX10 checks expect 0x3039 in an SGPR (s_movk_i32) and 0xddd5 as a
; literal operand of v_bfe_u32; the GFX9 checks expect both constants to be
; materialized into registers before the v_bfe_u32.
47 ; GCN-LABEL: {{^}}test_bfe_2lit_v:
48 ; GFX10: s_movk_i32 [[C1:s[0-9]+]], 0x3039
49 ; GFX10: v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, 0xddd5
50 ; GFX9-DAG: v_mov_b32_e32 [[C2:v[0-9]+]], 0xddd5
51 ; GFX9-DAG: s_movk_i32 [[C1:s[0-9]+]], 0x3039
52 ; GFX9: v_bfe_u32 v{{[0-9]+}}, [[C1]], v{{[0-9]+}}, [[C2]]
53 define amdgpu_kernel void @test_bfe_2lit_v(i32 addrspace(1)* %p) {
54 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
55 %ptr = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %id
56 %load = load i32, i32 addrspace(1)* %ptr, align 4
57 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 12345, i32 %load, i32 56789)
58 store i32 %bfe, i32 addrspace(1)* %ptr, align 4
59 ret void
60 }
61
62 declare i32 @llvm.amdgcn.workitem.id.x()
63 declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32)
11 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-CI %s
22 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3 | FileCheck --check-prefix=HSA %s
33 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-code-object-v3,-flat-for-global | FileCheck --check-prefix=HSA-VI %s
4 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj -mattr=-code-object-v3 | llvm-readobj --symbols -S --sd | FileCheck --check-prefix=ELF %s
5 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-readobj --symbols -S --sd | FileCheck %s --check-prefix=ELF
4 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj -mattr=-code-object-v3 | llvm-readobj -symbols -s -sd | FileCheck --check-prefix=ELF %s
5 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 | llvm-readobj -symbols -s -sd | FileCheck %s --check-prefix=ELF
6 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+WavefrontSize32,-WavefrontSize64,-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=GFX10 --check-prefix=GFX10-W32 %s
7 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64,-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=GFX10 --check-prefix=GFX10-W64 %s
68
79 ; The SHT_NOTE section contains the output from the .hsa_code_object_*
810 ; directives.
4446 ; HSA: .amd_kernel_code_t
4547 ; HSA: enable_sgpr_private_segment_buffer = 1
4648 ; HSA: enable_sgpr_kernarg_segment_ptr = 1
47 ; HSA: wavefront_size = 6
49
50 ; PRE-GFX10: enable_wavefront_size32 = 0
51 ; GFX10-W32: enable_wavefront_size32 = 1
52 ; GFX10-W64: enable_wavefront_size32 = 0
53
54 ; PRE-GFX10: wavefront_size = 6
55 ; GFX10-W32: wavefront_size = 5
56 ; GFX10-W64: wavefront_size = 6
57
4858 ; HSA: call_convention = -1
4959 ; HSA: .end_amd_kernel_code_t
5060 ; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
5464 ; On VI+ we also need to set MTYPE = 2
5565 ; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
5666 ; Make sure we generate flat store for HSA
57 ; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
67 ; PRE-GFX10: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
68 ; GFX10: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
5869
5970 ; HSA: .Lfunc_end0:
6071 ; HSA: .size simple, .Lfunc_end0-simple
22 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
33 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s
44 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
57
68 ; add(mul(S0.x, S1.x),
79 ; add(mul(S0.y, S1.y), S2)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S2)
9597 ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
9698 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
9799 ; GFX9-DL-NEXT: s_endpgm
100 ;
101 ; GFX10-DL-LABEL: udot2:
102 ; GFX10-DL: ; %bb.0: ; %entry
103 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
104 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
105 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
106 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
107 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
108 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
109 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
110 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
111 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
112 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
113 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
114 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2
115 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
116 ; GFX10-DL-NEXT: s_endpgm
98117 <2 x i16> addrspace(1)* %src2,
99118 i32 addrspace(1)* nocapture %dst) {
100119 entry:
219238 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
220239 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
221240 ; GFX9-DL-NEXT: s_endpgm
241 ;
242 ; GFX10-DL-LABEL: udot2_MulMul:
243 ; GFX10-DL: ; %bb.0: ; %entry
244 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
245 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
246 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
247 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
248 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
250 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
251 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
252 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
253 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
254 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
255 ; GFX10-DL-NEXT: s_and_b32 s0, s3, s2
256 ; GFX10-DL-NEXT: s_and_b32 s1, s4, s2
257 ; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 16
258 ; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16
259 ; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s1, s0
260 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2
261 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v2
262 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
263 ; GFX10-DL-NEXT: s_endpgm
222264 <2 x i16> addrspace(1)* %src2,
223265 i32 addrspace(1)* nocapture %dst) {
224266 entry:
329371 ; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s3, v2, v3
330372 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
331373 ; GFX9-DL-NEXT: s_endpgm
374 ;
375 ; GFX10-DL-LABEL: idot2:
376 ; GFX10-DL: ; %bb.0: ; %entry
377 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
378 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
379 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
380 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
381 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
382 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
383 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
384 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
385 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
386 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
387 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
388 ; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s3, s2, v2
389 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
390 ; GFX10-DL-NEXT: s_endpgm
332391 <2 x i16> addrspace(1)* %src2,
333392 i32 addrspace(1)* nocapture %dst) {
334393 entry:
446505 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
447506 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
448507 ; GFX9-DL-NEXT: s_endpgm
508 ;
509 ; GFX10-DL-LABEL: idot2_MixedTypedMul:
510 ; GFX10-DL: ; %bb.0: ; %entry
511 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
512 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
513 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
514 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
515 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
516 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
517 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
518 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
519 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
520 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
521 ; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 16
522 ; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16
523 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
524 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2
525 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3
526 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
527 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2
528 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
529 ; GFX10-DL-NEXT: s_endpgm
449530 <2 x i16> addrspace(1)* %src2,
450531 i32 addrspace(1)* nocapture %dst) {
451532 entry:
560641 ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
561642 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
562643 ; GFX9-DL-NEXT: s_endpgm
644 ;
645 ; GFX10-DL-LABEL: udot2_alt_AddOperands:
646 ; GFX10-DL: ; %bb.0: ; %entry
647 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
648 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
649 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
650 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
651 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
652 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
653 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
654 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
655 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
656 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
657 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
658 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2
659 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
660 ; GFX10-DL-NEXT: s_endpgm
563661 <2 x i16> addrspace(1)* %src2,
564662 i32 addrspace(1)* nocapture %dst) {
565663 entry:
677775 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
678776 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
679777 ; GFX9-DL-NEXT: s_endpgm
778 ;
779 ; GFX10-DL-LABEL: idot2_MixedExt:
780 ; GFX10-DL: ; %bb.0: ; %entry
781 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
782 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
783 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
784 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
785 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
786 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
787 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
788 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
789 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
790 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
791 ; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16
792 ; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16
793 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
794 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2
795 ; GFX10-DL-NEXT: s_and_b32 s3, s3, 0xffff
796 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2
797 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2
798 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
799 ; GFX10-DL-NEXT: s_endpgm
680800 <2 x i16> addrspace(1)* %src2,
681801 i32 addrspace(1)* nocapture %dst) {
682802 entry:
778898 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2
779899 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
780900 ; GFX9-DL-NEXT: s_endpgm
901 ;
902 ; GFX10-DL-LABEL: notudot2_SameVec:
903 ; GFX10-DL: ; %bb.0: ; %entry
904 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
905 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
906 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
907 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
908 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
909 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
910 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
911 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
912 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
913 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
914 ; GFX10-DL-NEXT: s_and_b32 s0, s2, 0xffff
915 ; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16
916 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s1, s4
917 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2
918 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
919 ; GFX10-DL-NEXT: s_endpgm
781920 <2 x i16> addrspace(1)* %src2,
782921 i32 addrspace(1)* nocapture %dst) {
783922 entry:
8921031 ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
8931032 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
8941033 ; GFX9-DL-NEXT: s_endpgm
1034 ;
1035 ; GFX10-DL-LABEL: udot2_v4i16:
1036 ; GFX10-DL: ; %bb.0: ; %entry
1037 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1038 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1039 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1040 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1041 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1042 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1043 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1044 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1045 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1046 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1047 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
1048 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2
1049 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1050 ; GFX10-DL-NEXT: s_endpgm
8951051 <4 x i16> addrspace(1)* %src2,
8961052 i32 addrspace(1)* nocapture %dst) {
8971053 entry:
10061162 ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
10071163 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
10081164 ; GFX9-DL-NEXT: s_endpgm
1165 ;
1166 ; GFX10-DL-LABEL: udot2_v4i16_Hi:
1167 ; GFX10-DL: ; %bb.0: ; %entry
1168 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1169 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1170 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1171 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1172 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x4
1173 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x4
1174 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1175 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1176 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1177 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1178 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
1179 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2
1180 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1181 ; GFX10-DL-NEXT: s_endpgm
10091182 <4 x i16> addrspace(1)* %src2,
10101183 i32 addrspace(1)* nocapture %dst) {
10111184 entry:
11271300 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
11281301 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
11291302 ; GFX9-DL-NEXT: s_endpgm
1303 ;
1304 ; GFX10-DL-LABEL: notudot2_v4i16_Even:
1305 ; GFX10-DL: ; %bb.0: ; %entry
1306 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1307 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1308 ; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff
1309 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1310 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1311 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1312 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1313 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0
1314 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1315 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1316 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1317 ; GFX10-DL-NEXT: s_and_b32 s0, s3, s8
1318 ; GFX10-DL-NEXT: s_and_b32 s1, s5, s8
1319 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6
1320 ; GFX10-DL-NEXT: s_and_b32 s2, s2, s8
1321 ; GFX10-DL-NEXT: s_and_b32 s3, s4, s8
1322 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
1323 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2
1324 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1325 ; GFX10-DL-NEXT: s_endpgm
11301326 <4 x i16> addrspace(1)* %src2,
11311327 i32 addrspace(1)* nocapture %dst) {
11321328 entry:
12481444 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
12491445 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
12501446 ; GFX9-DL-NEXT: s_endpgm
1447 ;
1448 ; GFX10-DL-LABEL: notudot2_v4i16_Middle:
1449 ; GFX10-DL: ; %bb.0: ; %entry
1450 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1451 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1452 ; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff
1453 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1454 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1455 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1456 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
1457 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0
1458 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1459 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1460 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1461 ; GFX10-DL-NEXT: s_and_b32 s0, s3, s8
1462 ; GFX10-DL-NEXT: s_and_b32 s1, s5, s8
1463 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6
1464 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16
1465 ; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16
1466 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
1467 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2
1468 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1469 ; GFX10-DL-NEXT: s_endpgm
12511470 <4 x i16> addrspace(1)* %src2,
12521471 i32 addrspace(1)* nocapture %dst) {
12531472 entry:
13691588 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
13701589 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
13711590 ; GFX9-DL-NEXT: s_endpgm
1591 ;
1592 ; GFX10-DL-LABEL: notudot2_DiffIndex:
1593 ; GFX10-DL: ; %bb.0: ; %entry
1594 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1595 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1596 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
1597 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1598 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1599 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1600 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1601 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1602 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1603 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1604 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1605 ; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16
1606 ; GFX10-DL-NEXT: s_and_b32 s1, s4, s2
1607 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
1608 ; GFX10-DL-NEXT: s_and_b32 s2, s3, s2
1609 ; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16
1610 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
1611 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2
1612 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1613 ; GFX10-DL-NEXT: s_endpgm
13721614 <2 x i16> addrspace(1)* %src2,
13731615 i32 addrspace(1)* nocapture %dst) {
13741616 entry:
14941736 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
14951737 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
14961738 ; GFX9-DL-NEXT: s_endpgm
1739 ;
1740 ; GFX10-DL-LABEL: udot2_MultipleUses_add1:
1741 ; GFX10-DL: ; %bb.0: ; %entry
1742 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1743 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1744 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
1745 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1746 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1747 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1748 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1749 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1750 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1751 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1752 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1753 ; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16
1754 ; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 16
1755 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
1756 ; GFX10-DL-NEXT: s_and_b32 s3, s3, s2
1757 ; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
1758 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
1759 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s3, v2
1760 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
1761 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1762 ; GFX10-DL-NEXT: s_endpgm
14971763 <2 x i16> addrspace(1)* %src2,
14981764 i32 addrspace(1)* nocapture %dst) {
14991765 entry:
16171883 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
16181884 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
16191885 ; GFX9-DL-NEXT: s_endpgm
1886 ;
1887 ; GFX10-DL-LABEL: idot2_MultipleUses_add1:
1888 ; GFX10-DL: ; %bb.0: ; %entry
1889 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1890 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1891 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1892 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1893 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1894 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1895 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1896 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1897 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1898 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1899 ; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16
1900 ; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16
1901 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
1902 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2
1903 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3
1904 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2
1905 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s3, s2, v2
1906 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
1907 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1908 ; GFX10-DL-NEXT: s_endpgm
16201909 <2 x i16> addrspace(1)* %src2,
16211910 i32 addrspace(1)* nocapture %dst) {
16221911 entry:
17442033 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
17452034 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
17462035 ; GFX9-DL-NEXT: s_endpgm
2036 ;
2037 ; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
2038 ; GFX10-DL: ; %bb.0: ; %entry
2039 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2040 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2041 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
2042 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2043 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2044 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
2045 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
2046 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
2047 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
2048 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
2049 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2050 ; GFX10-DL-NEXT: s_and_b32 s0, s3, s2
2051 ; GFX10-DL-NEXT: s_and_b32 s1, s4, s2
2052 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
2053 ; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 16
2054 ; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16
2055 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
2056 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2
2057 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
2058 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
2059 ; GFX10-DL-NEXT: s_endpgm
17472060 <2 x i16> addrspace(1)* %src2,
17482061 i32 addrspace(1)* nocapture %dst) {
17492062 entry:
18682181 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
18692182 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
18702183 ; GFX9-DL-NEXT: s_endpgm
2184 ;
2185 ; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
2186 ; GFX10-DL: ; %bb.0: ; %entry
2187 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2188 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2189 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2190 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2191 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2192 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2193 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
2194 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
2195 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
2196 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2197 ; GFX10-DL-NEXT: s_sext_i32_i16 s0, s2
2198 ; GFX10-DL-NEXT: s_sext_i32_i16 s1, s3
2199 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
2200 ; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 16
2201 ; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 16
2202 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2
2203 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2
2204 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2
2205 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
2206 ; GFX10-DL-NEXT: s_endpgm
18712207 <2 x i16> addrspace(1)* %src2,
18722208 i32 addrspace(1)* nocapture %dst) {
18732209 entry:
19962332 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
19972333 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
19982334 ; GFX9-DL-NEXT: s_endpgm
2335 ;
2336 ; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
2337 ; GFX10-DL: ; %bb.0: ; %entry
2338 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2339 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2340 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
2341 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2342 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2343 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
2344 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
2345 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
2346 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
2347 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
2348 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2349 ; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16
2350 ; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 16
2351 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
2352 ; GFX10-DL-NEXT: s_and_b32 s3, s3, s2
2353 ; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
2354 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
2355 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
2356 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
2357 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
2358 ; GFX10-DL-NEXT: s_endpgm
19992359 <2 x i16> addrspace(1)* %src2,
20002360 i32 addrspace(1)* nocapture %dst) {
20012361 entry:
21202480 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
21212481 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
21222482 ; GFX9-DL-NEXT: s_endpgm
2483 ;
2484 ; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
2485 ; GFX10-DL: ; %bb.0: ; %entry
2486 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2487 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2488 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2489 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2490 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2491 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2492 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
2493 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
2494 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
2495 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2496 ; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16
2497 ; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16
2498 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
2499 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2
2500 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3
2501 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2
2502 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2
2503 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2
2504 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
2505 ; GFX10-DL-NEXT: s_endpgm
21232506 <2 x i16> addrspace(1)* %src2,
21242507 i32 addrspace(1)* nocapture %dst) {
21252508 entry:
22372620 ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2
22382621 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
22392622 ; GFX9-DL-NEXT: s_endpgm
2623 ;
2624 ; GFX10-DL-LABEL: udot2_acc16:
2625 ; GFX10-DL: ; %bb.0: ; %entry
2626 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2627 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2628 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2629 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
2630 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
2631 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2632 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
2633 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2634 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
2635 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
2636 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2637 ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s0, s1, v2
2638 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
2639 ; GFX10-DL-NEXT: s_endpgm
22402640 <2 x i16> addrspace(1)* %src2,
22412641 i16 addrspace(1)* nocapture %dst) {
22422642 entry:
23692769 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, v5, v2, v3
23702770 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
23712771 ; GFX9-DL-NEXT: s_endpgm
2772 ;
2773 ; GFX10-DL-LABEL: notsdot2_sext8:
2774 ; GFX10-DL: ; %bb.0: ; %entry
2775 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2776 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2777 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff
2778 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2779 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2780 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
2781 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
2782 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, s5
2783 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s7
2784 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
2785 ; GFX10-DL-NEXT: global_load_ushort v2, v[2:3], off
2786 ; GFX10-DL-NEXT: global_load_ushort v7, v[0:1], off
2787 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
2788 ; GFX10-DL-NEXT: v_and_b32_sdwa v1, v2, v4
2789 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2790 ; GFX10-DL-NEXT: v_and_b32_sdwa v3, v7, v4
2791 ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
2792 ; GFX10-DL-NEXT: v_bfe_i32 v0, v7, 0, 8
2793 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
2794 ; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
2795 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2796 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, v3, v1, s2
2797 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, v0, v2, v1
2798 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
2799 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
2800 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
2801 ; GFX10-DL-NEXT: s_endpgm
23722802 <2 x i8> addrspace(1)* %src2,
23732803 i32 addrspace(1)* nocapture %dst) {
23742804 entry:
22 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
33 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
44 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
57
68 define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
79 ; GFX7-LABEL: idot4_acc32:
113115 ; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v2, v3
114116 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
115117 ; GFX9-DL-NEXT: s_endpgm
118 ;
119 ; GFX10-DL-LABEL: idot4_acc32:
120 ; GFX10-DL: ; %bb.0: ; %entry
121 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
122 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
123 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
124 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
125 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
126 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
127 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
128 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
129 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
130 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
131 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
132 ; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s2, s3, v2
133 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
134 ; GFX10-DL-NEXT: s_endpgm
116135 <4 x i8> addrspace(1)* %src2,
117136 i32 addrspace(1)* nocapture %dst) {
118137 entry:
273292 ; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2
274293 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
275294 ; GFX9-DL-NEXT: s_endpgm
295 ;
296 ; GFX10-DL-LABEL: idot4_acc16:
297 ; GFX10-DL: ; %bb.0: ; %entry
298 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
299 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
300 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
301 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
302 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
303 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
304 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
305 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
306 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
307 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
308 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
309 ; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v2
310 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
311 ; GFX10-DL-NEXT: s_endpgm
276312 <4 x i8> addrspace(1)* %src2,
277313 i16 addrspace(1)* nocapture %dst) {
278314 entry:
425461 ; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
426462 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
427463 ; GFX9-DL-NEXT: s_endpgm
464 ;
465 ; GFX10-DL-LABEL: idot4_acc8:
466 ; GFX10-DL: ; %bb.0: ; %entry
467 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
468 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
469 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
470 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
471 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
472 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
473 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
474 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
475 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
476 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
477 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
478 ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2
479 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
480 ; GFX10-DL-NEXT: s_endpgm
428481 <4 x i8> addrspace(1)* %src2,
429482 i8 addrspace(1)* nocapture %dst) {
430483 entry:
584637 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
585638 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
586639 ; GFX9-DL-NEXT: s_endpgm
640 ;
641 ; GFX10-DL-LABEL: idot4_multiuse_mul1:
642 ; GFX10-DL: ; %bb.0: ; %entry
643 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
644 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
645 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
646 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
647 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
648 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
649 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
650 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
651 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
652 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
653 ; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2
654 ; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3
655 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
656 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80008
657 ; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80008
658 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
659 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
660 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010
661 ; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010
662 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
663 ; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 24
664 ; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 24
665 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
666 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
667 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
668 ; GFX10-DL-NEXT: s_endpgm
587669 <4 x i8> addrspace(1)* %src2,
588670 i32 addrspace(1)* nocapture %dst) {
589671 entry:
753835 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
754836 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
755837 ; GFX9-DL-NEXT: s_endpgm
838 ;
839 ; GFX10-DL-LABEL: idot4_acc32_vecMul:
840 ; GFX10-DL: ; %bb.0: ; %entry
841 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
842 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
843 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
844 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
845 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
846 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
847 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
848 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
849 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
850 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
851 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
852 ; GFX10-DL-NEXT: v_and_b32_sdwa v3, s2, v2
853 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, s3, v2
854 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, s4
855 ; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2
856 ; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3
857 ; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
858 ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
859 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010
860 ; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010
861 ; GFX10-DL-NEXT: v_mad_i32_i24 v4, s0, s1, v4
862 ; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 24
863 ; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 24
864 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v4
865 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
866 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
867 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
868 ; GFX10-DL-NEXT: s_endpgm
756869 <4 x i8> addrspace(1)* %src2,
757870 i32 addrspace(1)* nocapture %dst) {
758871 entry:
9381051 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
9391052 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
9401053 ; GFX9-DL-NEXT: s_endpgm
1054 ;
1055 ; GFX10-DL-LABEL: idot4_acc16_vecMul:
1056 ; GFX10-DL: ; %bb.0: ; %entry
1057 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1058 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1059 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
1060 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1061 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1062 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1063 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1064 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1065 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1066 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off
1067 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1068 ; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x80000
1069 ; GFX10-DL-NEXT: s_bfe_i32 s1, s3, 0x80000
1070 ; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16
1071 ; GFX10-DL-NEXT: s_lshr_b32 s5, s3, 16
1072 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, sext(s2), v2
1073 ; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2
1074 ; GFX10-DL-NEXT: v_and_b32_e32 v6, s1, v2
1075 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, sext(s3), v2
1076 ; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x80000
1077 ; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x80000
1078 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7
1079 ; GFX10-DL-NEXT: v_and_b32_sdwa v8, sext(s4), v2
1080 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
1081 ; GFX10-DL-NEXT: v_and_b32_sdwa v6, sext(s5), v2
1082 ; GFX10-DL-NEXT: v_and_b32_e32 v7, s1, v2
1083 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2
1084 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
1085 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7
1086 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2
1087 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5
1088 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1089 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
1090 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1091 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2
1092 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1093 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
1094 ; GFX10-DL-NEXT: s_endpgm
9411095 <4 x i8> addrspace(1)* %src2,
9421096 i16 addrspace(1)* nocapture %dst) {
9431097 entry:
22 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
33 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
44 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
57
68 define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
79 ; GFX7-LABEL: udot4_acc32:
116118 ; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v2, v3
117119 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
118120 ; GFX9-DL-NEXT: s_endpgm
121 ;
122 ; GFX10-DL-LABEL: udot4_acc32:
123 ; GFX10-DL: ; %bb.0: ; %entry
124 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
125 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
126 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
127 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
128 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
129 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
130 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
131 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
132 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
133 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
134 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
135 ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s2, s3, v2
136 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
137 ; GFX10-DL-NEXT: s_endpgm
119138 <4 x i8> addrspace(1)* %src2,
120139 i32 addrspace(1)* nocapture %dst) {
121140 entry:
269288 ; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
270289 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
271290 ; GFX9-DL-NEXT: s_endpgm
291 ;
292 ; GFX10-DL-LABEL: udot4_acc16:
293 ; GFX10-DL: ; %bb.0: ; %entry
294 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
295 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
296 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
297 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
298 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
299 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
300 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
301 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
302 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
303 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
304 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
305 ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2
306 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
307 ; GFX10-DL-NEXT: s_endpgm
272308 <4 x i8> addrspace(1)* %src2,
273309 i16 addrspace(1)* nocapture %dst) {
274310 entry:
422458 ; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
423459 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
424460 ; GFX9-DL-NEXT: s_endpgm
461 ;
462 ; GFX10-DL-LABEL: udot4_acc8:
463 ; GFX10-DL: ; %bb.0: ; %entry
464 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
465 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
466 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
467 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
468 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
469 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
470 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
471 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
473 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
474 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
475 ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2
476 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
477 ; GFX10-DL-NEXT: s_endpgm
425478 <4 x i8> addrspace(1)* %src2,
426479 i8 addrspace(1)* nocapture %dst) {
427480 entry:
551604 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
552605 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
553606 ; GFX9-DL-NEXT: s_endpgm
607 ;
608 ; GFX10-DL-LABEL: udot2_8:
609 ; GFX10-DL: ; %bb.0: ; %entry
610 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
611 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
612 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
613 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
614 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
615 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
616 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
617 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
618 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
619 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
620 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
621 ; GFX10-DL-NEXT: s_and_b32 s0, s3, s2
622 ; GFX10-DL-NEXT: s_and_b32 s1, s4, s2
623 ; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80008
624 ; GFX10-DL-NEXT: s_bfe_u32 s3, s3, 0x80008
625 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
626 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
627 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2
628 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
629 ; GFX10-DL-NEXT: s_endpgm
554630 <4 x i8> addrspace(1)* %src2,
555631 i8 addrspace(1)* nocapture %dst) {
556632 entry:
685761 ; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2
686762 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
687763 ; GFX9-DL-NEXT: s_endpgm
764 ;
765 ; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
766 ; GFX10-DL: ; %bb.0: ; %entry
767 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
768 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
769 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
770 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
771 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
772 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
773 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
774 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
775 ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
776 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
777 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
778 ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s1, s0, v2
779 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
780 ; GFX10-DL-NEXT: s_endpgm
688781 <4 x i8> addrspace(1)* %src2,
689782 i8 addrspace(1)* nocapture %dst) {
690783 entry:
846939 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
847940 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
848941 ; GFX9-DL-NEXT: s_endpgm
942 ;
943 ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
944 ; GFX10-DL: ; %bb.0: ; %entry
945 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
946 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
947 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
948 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
949 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
950 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
951 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
952 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
953 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
954 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
955 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
956 ; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80008
957 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80008
958 ; GFX10-DL-NEXT: s_and_b32 s5, s3, s2
959 ; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
960 ; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80010
961 ; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010
962 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
963 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
964 ; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24
965 ; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24
966 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2
967 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s6, v2
968 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
969 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
970 ; GFX10-DL-NEXT: s_endpgm
849971 <4 x i8> addrspace(1)* %src2,
850972 i8 addrspace(1)* nocapture %dst) {
851973 entry:
10101132 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
10111133 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
10121134 ; GFX9-DL-NEXT: s_endpgm
1135 ;
1136 ; GFX10-DL-LABEL: udot4_multiuse_mul1:
1137 ; GFX10-DL: ; %bb.0: ; %entry
1138 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1139 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1140 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
1141 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1142 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1143 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1144 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1145 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1146 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1147 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1148 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1149 ; GFX10-DL-NEXT: s_and_b32 s0, s3, s2
1150 ; GFX10-DL-NEXT: s_and_b32 s1, s4, s2
1151 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
1152 ; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80008
1153 ; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80008
1154 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1155 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2
1156 ; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80010
1157 ; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80010
1158 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1159 ; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24
1160 ; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24
1161 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2
1162 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1163 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1164 ; GFX10-DL-NEXT: s_endpgm
10131165 <4 x i8> addrspace(1)* %src2,
10141166 i32 addrspace(1)* nocapture %dst) {
10151167 entry:
11871339 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
11881340 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
11891341 ; GFX9-DL-NEXT: s_endpgm
1342 ;
1343 ; GFX10-DL-LABEL: udot4_multiuse_add1:
1344 ; GFX10-DL: ; %bb.0: ; %entry
1345 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1346 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1347 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
1348 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1349 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1350 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1351 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1352 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1353 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1354 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1355 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1356 ; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80008
1357 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80008
1358 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
1359 ; GFX10-DL-NEXT: s_and_b32 s6, s3, s2
1360 ; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
1361 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1362 ; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80010
1363 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80010
1364 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s6, s2, v2
1365 ; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24
1366 ; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24
1367 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v2
1368 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s0, s1, v3
1369 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s3, v3
1370 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
1371 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1372 ; GFX10-DL-NEXT: s_endpgm
11901373 <4 x i8> addrspace(1)* %src2,
11911374 i32 addrspace(1)* nocapture %dst) {
11921375 entry:
13551538 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
13561539 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
13571540 ; GFX9-DL-NEXT: s_endpgm
1541 ;
1542 ; GFX10-DL-LABEL: notdot4_mixedtypes:
1543 ; GFX10-DL: ; %bb.0: ; %entry
1544 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1545 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1546 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1547 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1548 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1549 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1550 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1551 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1552 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
1553 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1554 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x80008
1555 ; GFX10-DL-NEXT: s_bfe_u32 s1, s3, 0x80008
1556 ; GFX10-DL-NEXT: s_sext_i32_i8 s4, s2
1557 ; GFX10-DL-NEXT: s_sext_i32_i8 s5, s3
1558 ; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80010
1559 ; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010
1560 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1561 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1562 ; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 24
1563 ; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 24
1564 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
1565 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s6, s7, v2
1566 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1567 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
1568 ; GFX10-DL-NEXT: s_endpgm
13581569 <4 x i8> addrspace(1)* %src2,
13591570 i16 addrspace(1)* nocapture %dst) {
13601571 entry:
15211732 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
15221733 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
15231734 ; GFX9-DL-NEXT: s_endpgm
1735 ;
1736 ; GFX10-DL-LABEL: udot4_acc32_vecMul:
1737 ; GFX10-DL: ; %bb.0: ; %entry
1738 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1739 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1740 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
1741 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
1742 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1743 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1744 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1745 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1746 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1747 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1748 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1749 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1750 ; GFX10-DL-NEXT: s_and_b32 s0, s3, s2
1751 ; GFX10-DL-NEXT: s_and_b32 s1, s4, s2
1752 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, s5
1753 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1754 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1755 ; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80010
1756 ; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80010
1757 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s0, s1, v3
1758 ; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24
1759 ; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24
1760 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3
1761 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2
1762 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1763 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1764 ; GFX10-DL-NEXT: s_endpgm
15241765 <4 x i8> addrspace(1)* %src2,
15251766 i32 addrspace(1)* nocapture %dst) {
15261767 entry:
16921933 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
16931934 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
16941935 ; GFX9-DL-NEXT: s_endpgm
1936 ;
1937 ; GFX10-DL-LABEL: udot4_acc16_vecMul:
1938 ; GFX10-DL: ; %bb.0: ; %entry
1939 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1940 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1941 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
1942 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1943 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1944 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1945 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1946 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1947 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1948 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off
1949 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1950 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1951 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1952 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
1953 ; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1954 ; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 16
1955 ; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16
1956 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7
1957 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24
1958 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
1959 ; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24
1960 ; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1961 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1962 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
1963 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, s3, 16, v6
1964 ; GFX10-DL-NEXT: v_lshl_or_b32 v2, s2, 16, v2
1965 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5
1966 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1967 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
1968 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1969 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2
1970 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1971 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
1972 ; GFX10-DL-NEXT: s_endpgm
16951973 <4 x i8> addrspace(1)* %src2,
16961974 i16 addrspace(1)* nocapture %dst) {
16971975 entry:
18732151 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
18742152 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
18752153 ; GFX9-DL-NEXT: s_endpgm
2154 ;
2155 ; GFX10-DL-LABEL: udot4_acc8_vecMul:
2156 ; GFX10-DL: ; %bb.0: ; %entry
2157 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2158 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2159 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
2160 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
2161 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2162 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2163 ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
2164 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
2165 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
2166 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
2167 ; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
2168 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2169 ; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24
2170 ; GFX10-DL-NEXT: s_lshr_b32 s5, s4, 24
2171 ; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16
2172 ; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 16
2173 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
2174 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
2175 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s3, s4
2176 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s5
2177 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s1, s6
2178 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5
2179 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2180 ; GFX10-DL-NEXT: v_and_b32_sdwa v6, v7, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2181 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v8, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2182 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2183 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2184 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2185 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2186 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2
2187 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2188 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v2, v3
2189 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4
2190 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2191 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2192 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
2193 ; GFX10-DL-NEXT: s_endpgm
18762194 <4 x i8> addrspace(1)* %src2,
18772195 i8 addrspace(1)* nocapture %dst) {
18782196 entry:
22 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
33 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
44 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
57
68 define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
79 ; GFX7-LABEL: idot8_acc32:
161163 ; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s2, v2, v3
162164 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
163165 ; GFX9-DL-NEXT: s_endpgm
166 ;
167 ; GFX10-DL-LABEL: idot8_acc32:
168 ; GFX10-DL: ; %bb.0: ; %entry
169 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
170 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
171 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
172 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
173 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
174 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
175 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
176 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
177 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
178 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
179 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
180 ; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s2, s4, v2
181 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
182 ; GFX10-DL-NEXT: s_endpgm
164183 <8 x i4> addrspace(1)* %src2,
165184 i32 addrspace(1)* nocapture %dst) {
166185 entry:
448467 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
449468 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
450469 ; GFX9-DL-NEXT: s_endpgm
470 ;
471 ; GFX10-DL-LABEL: idot8_acc16:
472 ; GFX10-DL: ; %bb.0: ; %entry
473 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
474 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
475 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
476 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
477 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
478 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
479 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
480 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
481 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
482 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off
483 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
484 ; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 12
485 ; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 12
486 ; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40000
487 ; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000
488 ; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40004
489 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0
490 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
491 ; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40004
492 ; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40008
493 ; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40008
494 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
495 ; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2
496 ; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x40010
497 ; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010
498 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v6, s1, s8
499 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
500 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5
501 ; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40014
502 ; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014
503 ; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40018
504 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
505 ; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2
506 ; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018
507 ; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28
508 ; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28
509 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
510 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3
511 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s0, v3
512 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
513 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3
514 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s9, s10, v2
515 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s8, v2
516 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s11, s12, v2
517 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2
518 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
519 ; GFX10-DL-NEXT: s_endpgm
451520 <8 x i4> addrspace(1)* %src2,
452521 i16 addrspace(1)* nocapture %dst) {
453522 entry:
743812 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
744813 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
745814 ; GFX9-DL-NEXT: s_endpgm
815 ;
816 ; GFX10-DL-LABEL: idot8_acc8:
817 ; GFX10-DL: ; %bb.0: ; %entry
818 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
819 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
820 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
821 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
822 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
823 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
824 ; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
825 ; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
826 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
827 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
828 ; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
829 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
830 ; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12
831 ; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12
832 ; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000
833 ; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000
834 ; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004
835 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0
836 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
837 ; GFX10-DL-NEXT: s_bfe_i32 s0, s5, 0x40004
838 ; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40008
839 ; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40008
840 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
841 ; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2
842 ; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010
843 ; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40010
844 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s1, s9
845 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
846 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
847 ; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014
848 ; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40014
849 ; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018
850 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
851 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
852 ; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018
853 ; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28
854 ; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28
855 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
856 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s6, s7, v3
857 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s8, s0, v3
858 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
859 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3
860 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2
861 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s9, v2
862 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2
863 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
864 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
865 ; GFX10-DL-NEXT: s_endpgm
746866 <8 x i4> addrspace(1)* %src2,
747867 i8 addrspace(1)* nocapture %dst) {
748868 entry:
10091129 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
10101130 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
10111131 ; GFX9-DL-NEXT: s_endpgm
1132 ;
1133 ; GFX10-DL-LABEL: idot8_multiuses_mul1:
1134 ; GFX10-DL: ; %bb.0: ; %entry
1135 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1136 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1137 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1138 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1139 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1140 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1141 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1142 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1143 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1144 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1145 ; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40000
1146 ; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40000
1147 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
1148 ; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40004
1149 ; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40004
1150 ; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40008
1151 ; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40008
1152 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
1153 ; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x4000c
1154 ; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x4000c
1155 ; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40010
1156 ; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40010
1157 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v2
1158 ; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40014
1159 ; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014
1160 ; GFX10-DL-NEXT: s_bfe_i32 s13, s2, 0x40018
1161 ; GFX10-DL-NEXT: s_bfe_i32 s14, s4, 0x40018
1162 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3
1163 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s8, v3
1164 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s9, s10, v3
1165 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s11, s12, v3
1166 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v3
1167 ; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 28
1168 ; GFX10-DL-NEXT: s_ashr_i32 s1, s4, 28
1169 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s13, s14, v3
1170 ; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v3
1171 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
1172 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1173 ; GFX10-DL-NEXT: s_endpgm
10121174 <8 x i4> addrspace(1)* %src2,
10131175 i32 addrspace(1)* nocapture %dst) {
10141176 entry:
13241486 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
13251487 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
13261488 ; GFX9-DL-NEXT: s_endpgm
1489 ;
1490 ; GFX10-DL-LABEL: idot8_acc32_vecMul:
1491 ; GFX10-DL: ; %bb.0: ; %entry
1492 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1493 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1494 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1495 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1496 ; GFX10-DL-NEXT: s_load_dword s5, s[4:5], 0x0
1497 ; GFX10-DL-NEXT: s_load_dword s7, s[6:7], 0x0
1498 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1499 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1500 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1501 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1502 ; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 28
1503 ; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 28
1504 ; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 24
1505 ; GFX10-DL-NEXT: s_lshl_b32 s13, s7, 24
1506 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
1507 ; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
1508 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1509 ; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 20
1510 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
1511 ; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 20
1512 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
1513 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s8, v2
1514 ; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
1515 ; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 16
1516 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1517 ; GFX10-DL-NEXT: s_lshl_b32 s1, s7, 16
1518 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s12, v2
1519 ; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12
1520 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
1521 ; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 12
1522 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60
1523 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s8, v2
1524 ; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 8
1525 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
1526 ; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[10:11], 60
1527 ; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 8
1528 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s12, v2
1529 ; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
1530 ; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 4
1531 ; GFX10-DL-NEXT: s_lshl_b32 s1, s7, 4
1532 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[8:9], 60
1533 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s14, v2
1534 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[10:11], 60
1535 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[0:1], 60
1536 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s12, v2
1537 ; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60
1538 ; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[6:7], 60
1539 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s10, v2
1540 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s4, v2
1541 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1542 ; GFX10-DL-NEXT: s_endpgm
13271543 <8 x i4> addrspace(1)* %src2,
13281544 i32 addrspace(1)* nocapture %dst) {
13291545 entry:
16341850 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
16351851 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
16361852 ; GFX9-DL-NEXT: s_endpgm
1853 ;
1854 ; GFX10-DL-LABEL: idot8_acc16_vecMul:
1855 ; GFX10-DL: ; %bb.0: ; %entry
1856 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1857 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1858 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1859 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1860 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1861 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1862 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1863 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1864 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
1865 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1866 ; GFX10-DL-NEXT: s_and_b32 s0, s2, 15
1867 ; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
1868 ; GFX10-DL-NEXT: s_and_b32 s5, s4, 15
1869 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
1870 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
1871 ; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
1872 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
1873 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008
1874 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
1875 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c
1876 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8
1877 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
1878 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010
1879 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s5 op_sel_hi:[0,1]
1880 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6
1881 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014
1882 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
1883 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40010
1884 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
1885 ; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014
1886 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1]
1887 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s7 op_sel_hi:[0,1]
1888 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s5
1889 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
1890 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s8
1891 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40018
1892 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
1893 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
1894 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018
1895 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28
1896 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
1897 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28
1898 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1]
1899 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
1900 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s2
1901 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
1902 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4
1903 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1]
1904 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1]
1905 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s1 op_sel_hi:[0,1]
1906 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6
1907 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1]
1908 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v8 op_sel_hi:[0,1]
1909 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v6, v7
1910 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1911 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
1912 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1913 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1914 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1915 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5
1916 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1917 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7
1918 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1919 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
1920 ; GFX10-DL-NEXT: s_endpgm
16371921 <8 x i4> addrspace(1)* %src2,
16381922 i16 addrspace(1)* nocapture %dst) {
16391923 entry:
20042288 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
20052289 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
20062290 ; GFX9-DL-NEXT: s_endpgm
2291 ;
2292 ; GFX10-DL-LABEL: idot8_acc8_vecMul:
2293 ; GFX10-DL: ; %bb.0: ; %entry
2294 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2295 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2296 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
2297 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
2298 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2299 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2300 ; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
2301 ; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
2302 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
2303 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
2304 ; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
2305 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2306 ; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 4
2307 ; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 8
2308 ; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 12
2309 ; GFX10-DL-NEXT: s_lshr_b32 s7, s5, 4
2310 ; GFX10-DL-NEXT: s_lshr_b32 s8, s5, 8
2311 ; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 12
2312 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
2313 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s6
2314 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s0
2315 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8
2316 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s9
2317 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s4
2318 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s7
2319 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s5
2320 ; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2
2321 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
2322 ; GFX10-DL-NEXT: v_and_b32_e32 v6, v15, v2
2323 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2
2324 ; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2
2325 ; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v2
2326 ; GFX10-DL-NEXT: v_and_b32_e32 v10, v19, v2
2327 ; GFX10-DL-NEXT: v_and_b32_e32 v11, v11, v2
2328 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5
2329 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
2330 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8
2331 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v9
2332 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v10
2333 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11
2334 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v6
2335 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7
2336 ; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 20
2337 ; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 24
2338 ; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 16
2339 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28
2340 ; GFX10-DL-NEXT: s_lshr_b32 s8, s5, 20
2341 ; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 24
2342 ; GFX10-DL-NEXT: s_lshr_b32 s7, s5, 16
2343 ; GFX10-DL-NEXT: s_lshr_b32 s5, s5, 28
2344 ; GFX10-DL-NEXT: v_and_b32_e32 v23, v15, v2
2345 ; GFX10-DL-NEXT: v_and_b32_e32 v10, v19, v2
2346 ; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2
2347 ; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2
2348 ; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
2349 ; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v2
2350 ; GFX10-DL-NEXT: v_and_b32_e32 v22, v7, v2
2351 ; GFX10-DL-NEXT: v_and_b32_e32 v11, v11, v2
2352 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v8
2353 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s4
2354 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9
2355 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s6
2356 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v22, v11
2357 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s0
2358 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v20, 12, s8
2359 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v23, v23, v10
2360 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v21, 12, s1
2361 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s5
2362 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s9
2363 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s7
2364 ; GFX10-DL-NEXT: v_and_b32_e32 v8, v12, v2
2365 ; GFX10-DL-NEXT: v_and_b32_e32 v9, v13, v2
2366 ; GFX10-DL-NEXT: v_and_b32_e32 v11, v15, v2
2367 ; GFX10-DL-NEXT: v_and_b32_e32 v12, v16, v2
2368 ; GFX10-DL-NEXT: v_and_b32_e32 v13, v17, v2
2369 ; GFX10-DL-NEXT: v_and_b32_e32 v15, v19, v2
2370 ; GFX10-DL-NEXT: v_and_b32_e32 v10, v21, v2
2371 ; GFX10-DL-NEXT: v_and_b32_e32 v14, v20, v2
2372 ; GFX10-DL-NEXT: v_and_b32_sdwa v6, v23, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2373 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2374 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2375 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2376 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12
2377 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13
2378 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8
2379 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v9
2380 ; GFX10-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2381 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2382 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v10
2383 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v15
2384 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11
2385 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v23, 12, v14
2386 ; GFX10-DL-NEXT: v_and_b32_e32 v5, v8, v2
2387 ; GFX10-DL-NEXT: v_and_b32_e32 v7, v9, v2
2388 ; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2
2389 ; GFX10-DL-NEXT: v_and_b32_e32 v9, v11, v2
2390 ; GFX10-DL-NEXT: v_and_b32_e32 v12, v12, v2
2391 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2392 ; GFX10-DL-NEXT: v_and_b32_e32 v10, v15, v2
2393 ; GFX10-DL-NEXT: v_and_b32_e32 v8, v19, v2
2394 ; GFX10-DL-NEXT: v_and_b32_e32 v11, v23, v2
2395 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v13
2396 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v12
2397 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v9, v10
2398 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4
2399 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v11
2400 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2401 ; GFX10-DL-NEXT: v_and_b32_sdwa v6, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2402 ; GFX10-DL-NEXT: v_and_b32_sdwa v8, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2403 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2404 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2405 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2406 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2407 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2408 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
2409 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v9
2410 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2411 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2412 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2
2413 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2
2414 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4
2415 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2416 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2417 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
2418 ; GFX10-DL-NEXT: s_endpgm
20072419 <8 x i4> addrspace(1)* %src2,
20082420 i8 addrspace(1)* nocapture %dst) {
20092421 entry:
22 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
33 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
44 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
57
68 define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
79 ; GFX7-LABEL: udot8_acc32:
161163 ; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3
162164 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
163165 ; GFX9-DL-NEXT: s_endpgm
166 ;
167 ; GFX10-DL-LABEL: udot8_acc32:
168 ; GFX10-DL: ; %bb.0: ; %entry
169 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
170 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
171 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
172 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
173 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
174 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
175 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
176 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
177 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
178 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
179 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
180 ; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s2, s4, v2
181 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
182 ; GFX10-DL-NEXT: s_endpgm
164183 <8 x i4> addrspace(1)* %src2,
165184 i32 addrspace(1)* nocapture %dst) {
166185 entry:
422441 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
423442 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
424443 ; GFX9-DL-NEXT: s_endpgm
444 ;
445 ; GFX10-DL-LABEL: udot8_acc16:
446 ; GFX10-DL: ; %bb.0: ; %entry
447 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
448 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
449 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
450 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
451 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
452 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
453 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
454 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
455 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
456 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
457 ; GFX10-DL-NEXT: s_and_b32 s0, s2, 15
458 ; GFX10-DL-NEXT: s_and_b32 s1, s4, 15
459 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004
460 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
461 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
462 ; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008
463 ; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c
464 ; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c
465 ; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010
466 ; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010
467 ; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014
468 ; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014
469 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
470 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
471 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018
472 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018
473 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28
474 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28
475 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2
476 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
477 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2
478 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2
479 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2
480 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, s14, v2
481 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
482 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2
483 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
484 ; GFX10-DL-NEXT: s_endpgm
425485 <8 x i4> addrspace(1)* %src2,
426486 i16 addrspace(1)* nocapture %dst) {
427487 entry:
683743 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
684744 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
685745 ; GFX9-DL-NEXT: s_endpgm
746 ;
747 ; GFX10-DL-LABEL: udot8_acc8:
748 ; GFX10-DL: ; %bb.0: ; %entry
749 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
750 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
751 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
752 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
753 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
754 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
755 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
756 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
757 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
758 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
759 ; GFX10-DL-NEXT: s_and_b32 s0, s2, 15
760 ; GFX10-DL-NEXT: s_and_b32 s1, s4, 15
761 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004
762 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
763 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
764 ; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008
765 ; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c
766 ; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c
767 ; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010
768 ; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010
769 ; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014
770 ; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014
771 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
772 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
773 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018
774 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018
775 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28
776 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28
777 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2
778 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2
779 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2
780 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2
781 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2
782 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, s14, v2
783 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
784 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2
785 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
786 ; GFX10-DL-NEXT: s_endpgm
686787 <8 x i4> addrspace(1)* %src2,
687788 i8 addrspace(1)* nocapture %dst) {
688789 entry:
9541055 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
9551056 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
9561057 ; GFX9-DL-NEXT: s_endpgm
1058 ;
1059 ; GFX10-DL-LABEL: udot8_acc4:
1060 ; GFX10-DL: ; %bb.0: ; %entry
1061 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1062 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1063 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1064 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1065 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1066 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1067 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1068 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1069 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
1070 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1071 ; GFX10-DL-NEXT: s_and_b32 s0, s2, 15
1072 ; GFX10-DL-NEXT: s_and_b32 s1, s4, 15
1073 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004
1074 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
1075 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
1076 ; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
1077 ; GFX10-DL-NEXT: s_bfe_u32 s9, s4, 0x40008
1078 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1079 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1080 ; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c
1081 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010
1082 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2
1083 ; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s0
1084 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010
1085 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014
1086 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014
1087 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s9, v2
1088 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3
1089 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
1090 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
1091 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1092 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018
1093 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018
1094 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28
1095 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28
1096 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2
1097 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1098 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2
1099 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
1100 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
1101 ; GFX10-DL-NEXT: s_endpgm
9571102 <8 x i4> addrspace(1)* %src2,
9581103 i4 addrspace(1)* nocapture %dst) {
9591104 entry:
12091354 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
12101355 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
12111356 ; GFX9-DL-NEXT: s_endpgm
1357 ;
1358 ; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
1359 ; GFX10-DL: ; %bb.0: ; %entry
1360 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1361 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1362 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1363 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1364 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1365 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1366 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1367 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1368 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
1369 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1370 ; GFX10-DL-NEXT: s_and_b32 s0, s2, 15
1371 ; GFX10-DL-NEXT: s_and_b32 s1, s4, 15
1372 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004
1373 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
1374 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
1375 ; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
1376 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
1377 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1378 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c
1379 ; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008
1380 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2
1381 ; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s1
1382 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010
1383 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014
1384 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014
1385 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s0, v2
1386 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3
1387 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010
1388 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
1389 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
1390 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1391 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018
1392 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018
1393 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28
1394 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28
1395 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2
1396 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1397 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2
1398 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
1399 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
1400 ; GFX10-DL-NEXT: s_endpgm
12121401 <8 x i4> addrspace(1)* %src2,
12131402 i4 addrspace(1)* nocapture %dst) {
12141403 entry:
14571646 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
14581647 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
14591648 ; GFX9-DL-NEXT: s_endpgm
1649 ;
1650 ; GFX10-DL-LABEL: udot8_multiuses_mul1:
1651 ; GFX10-DL: ; %bb.0: ; %entry
1652 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1653 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1654 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1655 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1656 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1657 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1658 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1659 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1660 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1661 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1662 ; GFX10-DL-NEXT: s_and_b32 s0, s2, 15
1663 ; GFX10-DL-NEXT: s_and_b32 s1, s4, 15
1664 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
1665 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004
1666 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
1667 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
1668 ; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008
1669 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1670 ; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c
1671 ; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c
1672 ; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010
1673 ; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010
1674 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s5, s6, v2
1675 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014
1676 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014
1677 ; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40018
1678 ; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40018
1679 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s7, s8, v3
1680 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28
1681 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28
1682 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
1683 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s9, s10, v3
1684 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s11, s12, v3
1685 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s5, s6, v3
1686 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s13, s14, v3
1687 ; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s4, v3
1688 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
1689 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1690 ; GFX10-DL-NEXT: s_endpgm
14601691 <8 x i4> addrspace(1)* %src2,
14611692 i32 addrspace(1)* nocapture %dst) {
14621693 entry:
16851916 ; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3
16861917 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
16871918 ; GFX9-DL-NEXT: s_endpgm
1919 ;
1920 ; GFX10-DL-LABEL: udot8_acc32_vecMul:
1921 ; GFX10-DL: ; %bb.0: ; %entry
1922 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1923 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1924 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
1925 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1926 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1927 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1928 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1929 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
1930 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
1931 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
1932 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5
1933 ; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s2, s4, v2
1934 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
1935 ; GFX10-DL-NEXT: s_endpgm
16881936 <8 x i4> addrspace(1)* %src2,
16891937 i32 addrspace(1)* nocapture %dst) {
16901938 entry:
19452193 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
19462194 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
19472195 ; GFX9-DL-NEXT: s_endpgm
2196 ;
2197 ; GFX10-DL-LABEL: udot8_acc16_vecMul:
2198 ; GFX10-DL: ; %bb.0: ; %entry
2199 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2200 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2201 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2202 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2203 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2204 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
2205 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
2206 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
2207 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
2208 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2209 ; GFX10-DL-NEXT: s_and_b32 s0, s2, 15
2210 ; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
2211 ; GFX10-DL-NEXT: s_and_b32 s5, s4, 15
2212 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
2213 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
2214 ; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
2215 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
2216 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008
2217 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
2218 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c
2219 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8
2220 ; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40010
2221 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s5
2222 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s1, s6
2223 ; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40014
2224 ; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40010
2225 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014
2226 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s7, s0
2227 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s8, s1
2228 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018
2229 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
2230 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28
2231 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018
2232 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28
2233 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
2234 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2235 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
2236 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2237 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s1, s5
2238 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4
2239 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
2240 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2241 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1
2242 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
2243 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2244 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4
2245 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2246 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
2247 ; GFX10-DL-NEXT: s_endpgm
19482248 <8 x i4> addrspace(1)* %src2,
19492249 i16 addrspace(1)* nocapture %dst) {
19502250 entry:
22452545 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
22462546 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
22472547 ; GFX9-DL-NEXT: s_endpgm
2548 ;
2549 ; GFX10-DL-LABEL: udot8_acc8_vecMul:
2550 ; GFX10-DL: ; %bb.0: ; %entry
2551 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2552 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2553 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
2554 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2555 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2556 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2557 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
2558 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
2559 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
2560 ; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
2561 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2562 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40004
2563 ; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40004
2564 ; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x4000c
2565 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c
2566 ; GFX10-DL-NEXT: s_and_b32 s7, s2, 15
2567 ; GFX10-DL-NEXT: s_and_b32 s9, s4, 15
2568 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s5
2569 ; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40008
2570 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s1, s6
2571 ; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40008
2572 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s7, s9
2573 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2574 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40014
2575 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2576 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s8, s10
2577 ; GFX10-DL-NEXT: s_lshr_b32 s1, s2, 28
2578 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2579 ; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 28
2580 ; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40014
2581 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2582 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40010
2583 ; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40010
2584 ; GFX10-DL-NEXT: s_bfe_u32 s2, s2, 0x40018
2585 ; GFX10-DL-NEXT: s_bfe_u32 s4, s4, 0x40018
2586 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2587 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s0, s5
2588 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s1, s6
2589 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s7, s8
2590 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, s2, s4
2591 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v4
2592 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2593 ; GFX10-DL-NEXT: v_and_b32_sdwa v2, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2594 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2595 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2596 ; GFX10-DL-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2597 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2598 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v6, v4, v3
2599 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v6, v7
2600 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
2601 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2602 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2
2603 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2
2604 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4
2605 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2606 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2607 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
2608 ; GFX10-DL-NEXT: s_endpgm
22482609 <8 x i4> addrspace(1)* %src2,
22492610 i8 addrspace(1)* nocapture %dst) {
22502611 entry:
24802841 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
24812842 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
24822843 ; GFX9-DL-NEXT: s_endpgm
2844 ;
2845 ; GFX10-DL-LABEL: udot8_acc4_vecMul:
2846 ; GFX10-DL: ; %bb.0: ; %entry
2847 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2848 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2849 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
2850 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2851 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2852 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
2853 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
2854 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
2855 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
2856 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
2857 ; GFX10-DL-NEXT: s_and_b32 s0, s2, 15
2858 ; GFX10-DL-NEXT: s_and_b32 s1, s4, 15
2859 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004
2860 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004
2861 ; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
2862 ; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
2863 ; GFX10-DL-NEXT: s_bfe_u32 s9, s4, 0x40008
2864 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
2865 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
2866 ; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c
2867 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010
2868 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2
2869 ; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s0
2870 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010
2871 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014
2872 ; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014
2873 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s9, v2
2874 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3
2875 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
2876 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
2877 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
2878 ; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018
2879 ; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018
2880 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28
2881 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28
2882 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2
2883 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
2884 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2
2885 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
2886 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
2887 ; GFX10-DL-NEXT: s_endpgm
24832888 <8 x i4> addrspace(1)* %src2,
24842889 i4 addrspace(1)* nocapture %dst) {
24852890 entry:
26683073 ; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s3, v2, v3
26693074 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
26703075 ; GFX9-DL-NEXT: s_endpgm
3076 ;
3077 ; GFX10-DL-LABEL: udot8_variant1:
3078 ; GFX10-DL: ; %bb.0: ; %entry
3079 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3080 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3081 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
3082 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3083 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
3084 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
3085 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
3086 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
3087 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
3088 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3089 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
3090 ; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s3, s2, v2
3091 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
3092 ; GFX10-DL-NEXT: s_endpgm
26713093 i32 addrspace(1)* %v2addr,
26723094 i32 addrspace(1)* %dst) {
26733095 entry:
0 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
1
2 ; GCN-LABEL: _amdgpu_hs_main:
3
4 define amdgpu_hs void @_amdgpu_hs_main() #0 { ; HS entry point; attribute group #0 sets max work-group size 128 and target-features ",+wavefrontsize32"
5 .entry:
6 ret void ; no other instructions: the function returns immediately
7 }
8
9 ; GCN-LABEL: _amdgpu_ps_main:
10 ; GCN: s_and_saveexec_b64
11
12 define amdgpu_ps void @_amdgpu_ps_main(i32 %arg) local_unnamed_addr #1 { ; PS entry point; attribute group #1 sets target-features ",+wavefrontsize64"
13 .entry:
14 %tmp = tail call float @llvm.amdgcn.interp.p2(float undef, float undef, i32 1, i32 0, i32 %arg) #2 ; %tmp is passed on to the image sample call below
15 %tmp1 = tail call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float undef, float %tmp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) ; apart from %tmp, all operands are constants or undef
16 %tmp2 = fcmp olt float %tmp1, 5.000000e-01 ; ordered less-than compare against 0.5
17 br i1 %tmp2, label %bb, label %l ; two-way branch; the GCN check for this function expects s_and_saveexec_b64
18
19 bb: ; preds = %.entry
20 unreachable
21
22 l: ; preds = %.entry
23 ret void
24 }
25
26 ; GCN-LABEL: _amdgpu_gs_main:
27
28 define amdgpu_gs void @_amdgpu_gs_main() #4 { ; GS entry point; attribute group #4 sets target-features ",+wavefrontsize32"
29 .entry:
30 ret void ; no other instructions: the function returns immediately
31 }
32
33 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
34 declare float @llvm.amdgcn.image.sample.2d.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
35
36 attributes #0 = { "amdgpu-max-work-group-size"="128" "target-features"=",+wavefrontsize32" }
37 attributes #1 = { "target-features"=",+wavefrontsize64" }
38 attributes #2 = { nounwind readnone speculatable }
39 attributes #3 = { nounwind readonly }
40 attributes #4 = { "target-features"=",+wavefrontsize32" }
0 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
1 ;
2 ; Check that PS is wave64
3 ; GFX10-LABEL: _amdgpu_ps_main:
4 ; GFX10: s_and_saveexec_b64
5 ;
6 ; Check that VS is wave32
7 ; GFX10-LABEL: _amdgpu_vs_main:
8 ; GFX10: s_and_saveexec_b32
9 ;
10 ; Check that GS is wave32
11 ; GFX10-LABEL: _amdgpu_gs_main:
12 ; GFX10: s_and_saveexec_b32
13 ;
14 ; Check that HS is wave32
15 ; GFX10-LABEL: _amdgpu_hs_main:
16 ; GFX10: s_and_saveexec_b32
17 ;
18 ; Check that CS is wave32
19 ; GFX10-LABEL: _amdgpu_cs_main:
20 ; GFX10: s_and_saveexec_b32
21 ;
22 ; Check that:
23 ; PS_W32_EN (bit 15) of SPI_PS_IN_CONTROL (0xa1b6) is 0;
24 ; VS_W32_EN (bit 23) of VGT_SHADER_STAGES_EN (0xa2d5) is 1;
25 ; GS_W32_EN (bit 22) of VGT_SHADER_STAGES_EN (0xa2d5) is 1;
26 ; HS_W32_EN (bit 21) of VGT_SHADER_STAGES_EN (0xa2d5) is 1;
27 ; CS_W32_EN (bit 15) of COMPUTE_DISPATCH_INITIATOR (0x2e00) is 1.
28 ;
29 ; GFX10: .amd_amdgpu_pal_metadata{{.*}},0x2e00,0x8000,{{.*}}0xa1b6,0x1,{{.*}},0xa2d5,0xe00000,
30
31 define dllexport amdgpu_ps void @_amdgpu_ps_main(float %arg10) #0 { ; PS entry point; attribute group #0 sets +wavefrontsize64, matching the s_and_saveexec_b64 check
32 .entry:
33 %tmp100 = fcmp ogt float %arg10, 0.25 ; ordered greater-than compare against 0.25
34 br i1 %tmp100, label %if, label %endif ; conditional branch that the GFX10 saveexec check for this function relies on
35 if:
36 %tmp101 = fadd float %arg10, 0.125
37 br label %endif
38 endif:
39 %tmp102 = phi float [ %arg10, %.entry ], [ %tmp101, %if ] ; %arg10 when %if was skipped, %arg10 + 0.125 otherwise
40 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp102, float %tmp102, float %tmp102, float %tmp102, i1 true, i1 true) ; %tmp102 is passed as all four float operands
41 ret void
42 }
43
44 define dllexport amdgpu_vs void @_amdgpu_vs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, float %arg10) local_unnamed_addr #2 { ; VS entry point; only %arg10 is used in the body; attribute group #2 sets +wavefrontsize32
45 .entry:
46 %tmp100 = fcmp ogt float %arg10, 0.25 ; ordered greater-than compare against 0.25
47 br i1 %tmp100, label %if, label %endif ; conditional branch that the GFX10 s_and_saveexec_b32 check for this function relies on
48 if:
49 %tmp101 = fadd float %arg10, 0.125
50 br label %endif
51 endif:
52 %tmp102 = phi float [ %arg10, %.entry ], [ %tmp101, %if ] ; %arg10 when %if was skipped, %arg10 + 0.125 otherwise
53 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp102, float %tmp102, float %tmp102, float %tmp102, i1 false, i1 false) ; %tmp102 is passed as all four float operands
54 ret void
55 }
56
57 define dllexport amdgpu_gs void @_amdgpu_gs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, float %arg10) local_unnamed_addr #2 { ; GS entry point; only %arg10 is used in the body; attribute group #2 sets +wavefrontsize32
58 .entry:
59 %tmp100 = fcmp ogt float %arg10, 0.25 ; ordered greater-than compare against 0.25
60 br i1 %tmp100, label %if, label %endif ; conditional branch that the GFX10 s_and_saveexec_b32 check for this function relies on
61 if:
62 %tmp101 = fadd float %arg10, 0.125
63 br label %endif
64 endif:
65 %tmp102 = phi float [ %arg10, %.entry ], [ %tmp101, %if ] ; %arg10 when %if was skipped, %arg10 + 0.125 otherwise
66 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp102, float %tmp102, float %tmp102, float %tmp102, i1 false, i1 false) ; %tmp102 is passed as all four float operands
67 ret void
68 }
69
70 define dllexport amdgpu_hs void @_amdgpu_hs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, float %arg10) local_unnamed_addr #2 { ; HS entry point; only %arg10 is used in the body; attribute group #2 sets +wavefrontsize32
71 .entry:
72 %tmp100 = fcmp ogt float %arg10, 0.25 ; ordered greater-than compare against 0.25
73 br i1 %tmp100, label %if, label %endif ; conditional branch that the GFX10 s_and_saveexec_b32 check for this function relies on
74 if:
75 %tmp101 = fadd float %arg10, 0.125
76 br label %endif
77 endif:
78 %tmp102 = phi float [ %arg10, %.entry ], [ %tmp101, %if ] ; %arg10 when %if was skipped, %arg10 + 0.125 otherwise
79 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp102, float %tmp102, float %tmp102, float %tmp102, i1 false, i1 false) ; %tmp102 is passed as all four float operands
80 ret void
81 }
82
83 define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, float %arg10) local_unnamed_addr #2 { ; CS entry point; only %arg10 is used in the body; attribute group #2 sets +wavefrontsize32
84 .entry:
85 %tmp100 = fcmp ogt float %arg10, 0.25 ; ordered greater-than compare against 0.25
86 br i1 %tmp100, label %if, label %endif ; conditional branch that the GFX10 s_and_saveexec_b32 check for this function relies on
87 if:
88 %tmp101 = fadd float %arg10, 0.125
89 br label %endif
90 endif:
91 %tmp102 = phi float [ %arg10, %.entry ], [ %tmp101, %if ] ; %arg10 when %if was skipped, %arg10 + 0.125 otherwise
92 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp102, float %tmp102, float %tmp102, float %tmp102, i1 false, i1 false) ; %tmp102 is passed as all four float operands
93 ret void
94 }
95
96 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #2
97
98 attributes #0 = { nounwind "InitialPSInputAddr"="2" "target-features"="+wavefrontsize64" }
99 attributes #1 = { nounwind readnone speculatable }
100 attributes #2 = { nounwind "target-features"="+wavefrontsize32" }
101 attributes #3 = { nounwind readonly }
102
103 !amdgpu.pal.metadata = !{!8}
104
105 !6 = !{}
106 !8 = !{i32 268435482, i32 1, i32 268435488, i32 -1, i32 268435480, i32 -322237066, i32 268435481, i32 717283096, i32 268435538, i32 4096, i32 268435539, i32 8192, i32 11338, i32 53215232, i32 11339, i32 10, i32 41411, i32 4, i32 41393, i32 0, i32 41479, i32 0, i32 41476, i32 17301504, i32 41478, i32 1087, i32 41721, i32 45, i32 41633, i32 0, i32 41702, i32 0, i32 41653, i32 0, i32 41657, i32 0, i32 41661, i32 0, i32 41665, i32 0, i32 41645, i32 0, i32 41750, i32 14, i32 268435528, i32 0, i32 268435493, i32 0, i32 268435500, i32 0, i32 268435536, i32 0, i32 11274, i32 2883584, i32 11275, i32 4, i32 41412, i32 0, i32 41413, i32 4, i32 41400, i32 16777216, i32 41398, i32 1, i32 41395, i32 0, i32 41396, i32 0, i32 41397, i32 0, i32 41619, i32 100794764, i32 41475, i32 16, i32 41103, i32 15, i32 268435485, i32 0, i32 268435529, i32 0, i32 268435494, i32 0, i32 268435501, i32 0, i32 41685, i32 0, i32 268435460, i32 -431267536, i32 268435461, i32 -366377628, i32 268435476, i32 352863062, i32 268435477, i32 1678737839, i32 268435532, i32 1, i32 41642, i32 127, i32 11343, i32 268435459, i32 11344, i32 268435460, i32 11340, i32 268435456, i32 11342, i32 0, i32 41361, i32 0, i32 11276, i32 268435456}
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s
1 ; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=CHECK-O0
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64
1 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W32
2 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64
3 ; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info -o - %s | FileCheck %s --check-prefix=W64-O0
24
35 ; Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
46
5 ; CHECK-LABEL: mubuf_vgpr
6 ; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
7 ; CHECK: [[LOOPBB:BB[0-9]+_[0-9]+]]:
8 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
9 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
10 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
11 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
12 ; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
13 ; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
14 ; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
15 ; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
16 ; CHECK: s_waitcnt vmcnt(0)
17 ; CHECK: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
18 ; CHECK: s_xor_b64 exec, exec, [[CMP]]
19 ; CHECK: s_cbranch_execnz [[LOOPBB]]
20 ; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
21 ; CHECK: v_mov_b32_e32 v0, [[RES]]
7 ; W64-LABEL: mubuf_vgpr
8 ; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
9 ; W64: [[LOOPBB:BB[0-9]+_[0-9]+]]:
10 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
11 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
12 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
13 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
14 ; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
15 ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
16 ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
17 ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
18 ; W64: s_waitcnt vmcnt(0)
19 ; W64: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
20 ; W64: s_xor_b64 exec, exec, [[CMP]]
21 ; W64: s_cbranch_execnz [[LOOPBB]]
22 ; W64: s_mov_b64 exec, [[SAVEEXEC]]
23 ; W64: v_mov_b32_e32 v0, [[RES]]
24
25 ; W32-LABEL: mubuf_vgpr
26 ; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
27 ; W32: [[LOOPBB:BB[0-9]+_[0-9]+]]:
28 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
29 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
30 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
31 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
32 ; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
33 ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
34 ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
35 ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
36 ; W32: s_waitcnt vmcnt(0)
37 ; W32: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
38 ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
39 ; W32: s_cbranch_execnz [[LOOPBB]]
40 ; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
41 ; W32: v_mov_b32_e32 v0, [[RES]]
42
2243 define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 { ; per the file comment, exercises legalizing a VGPR Rsrc operand of a MUBUF instruction
2344 %call = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i1 zeroext false, i1 zeroext false) #1 ; %i and %c come straight from the function arguments; the remaining operands are constants
2445 ret float %call ; the loaded value is the function result (the checks expect it moved into v0)
2546 }
2647
27 ; CHECK-LABEL: mubuf_vgpr_adjacent_in_block
28
29 ; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
30 ; CHECK: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
31 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
32 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
33 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
34 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
35 ; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
36 ; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
37 ; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
38 ; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
39 ; CHECK: s_waitcnt vmcnt(0)
40 ; CHECK: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
41 ; CHECK: s_xor_b64 exec, exec, [[CMP]]
42 ; CHECK: s_cbranch_execnz [[LOOPBB0]]
43
44 ; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
48
49 ; W64-LABEL: mubuf_vgpr_adjacent_in_block
50
51 ; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
52 ; W64: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
53 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
54 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
55 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
56 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
57 ; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
58 ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
59 ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
60 ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
61 ; W64: s_waitcnt vmcnt(0)
62 ; W64: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
63 ; W64: s_xor_b64 exec, exec, [[CMP]]
64 ; W64: s_cbranch_execnz [[LOOPBB0]]
65
66 ; W64: s_mov_b64 exec, [[SAVEEXEC]]
4567 ; FIXME: redundant s_mov
46 ; CHECK: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
47
48 ; CHECK: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
49 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
50 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
51 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
52 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
53 ; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
54 ; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
55 ; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
56 ; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
57 ; CHECK: s_waitcnt vmcnt(0)
58 ; CHECK: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
59 ; CHECK: s_xor_b64 exec, exec, [[CMP]]
60 ; CHECK: s_cbranch_execnz [[LOOPBB1]]
61
62 ; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
63 ; CHECK-DAG: global_store_dword v[9:10], [[RES0]], off
64 ; CHECK-DAG: global_store_dword v[11:12], [[RES1]], off
68 ; W64: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
69
70 ; W64: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
71 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
72 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
73 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
74 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
75 ; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
76 ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
77 ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
78 ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
79 ; W64: s_waitcnt vmcnt(0)
80 ; W64: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
81 ; W64: s_xor_b64 exec, exec, [[CMP]]
82 ; W64: s_cbranch_execnz [[LOOPBB1]]
83
84 ; W64: s_mov_b64 exec, [[SAVEEXEC]]
85 ; W64-DAG: global_store_dword v[9:10], [[RES0]], off
86 ; W64-DAG: global_store_dword v[11:12], [[RES1]], off
87
88
89 ; W32-LABEL: mubuf_vgpr_adjacent_in_block
90
91 ; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
92 ; W32: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
93 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
94 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
95 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
96 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
97 ; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
98 ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
99 ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
100 ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
101 ; W32: s_waitcnt vmcnt(0)
102 ; W32: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
103 ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
104 ; W32: s_cbranch_execnz [[LOOPBB0]]
105
106 ; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
107 ; FIXME: redundant s_mov
108 ; W32: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
109
110 ; W32: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
111 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
112 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
113 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
114 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
115 ; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
116 ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
117 ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
118 ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
119 ; W32: s_waitcnt vmcnt(0)
120 ; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
121 ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
122 ; W32: s_cbranch_execnz [[LOOPBB1]]
123
124 ; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
125 ; W32-DAG: global_store_dword v[9:10], [[RES0]], off
126 ; W32-DAG: global_store_dword v[11:12], [[RES1]], off
65127
66128 define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %out0, float addrspace(1)* %out1) #0 {
67129 entry:
72134 ret void
73135 }
74136
75 ; CHECK-LABEL: mubuf_vgpr_outside_entry
76
77 ; CHECK-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
78 ; CHECK-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
79
80 ; CHECK: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
81 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
82 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
83 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
84 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
85 ; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
86 ; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
87 ; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
88 ; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
89 ; CHECK: s_waitcnt vmcnt(0)
90 ; CHECK: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
91 ; CHECK: s_xor_b64 exec, exec, [[CMP]]
92 ; CHECK: s_cbranch_execnz [[LOOPBB0]]
93
94 ; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
95 ; CHECK: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
96
97 ; CHECK: BB{{[0-9]+_[0-9]+}}:
98 ; CHECK-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
99 ; CHECK-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
100
101 ; CHECK: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
102 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
103 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
104 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
105 ; CHECK-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
106 ; CHECK: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
107 ; CHECK: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
108 ; CHECK: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
109 ; CHECK: s_and_saveexec_b64 [[CMP]], [[CMP]]
110 ; CHECK: s_waitcnt vmcnt(0)
111 ; CHECK: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
112 ; CHECK: s_xor_b64 exec, exec, [[CMP]]
113 ; CHECK: s_cbranch_execnz [[LOOPBB1]]
114
115 ; CHECK: s_mov_b64 exec, [[SAVEEXEC]]
116
117 ; CHECK: [[TERMBB]]:
118 ; CHECK: global_store_dword v[11:12], [[RES]], off
137
138 ; W64-LABEL: mubuf_vgpr_outside_entry
139
140 ; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
141 ; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
142
143 ; W64: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
144 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
145 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
146 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
147 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
148 ; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
149 ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
150 ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
151 ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
152 ; W64: s_waitcnt vmcnt(0)
153 ; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
154 ; W64: s_xor_b64 exec, exec, [[CMP]]
155 ; W64: s_cbranch_execnz [[LOOPBB0]]
156
157 ; W64: s_mov_b64 exec, [[SAVEEXEC]]
158 ; W64: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
159
160 ; W64: BB{{[0-9]+_[0-9]+}}:
161 ; W64-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
162 ; W64-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
163
164 ; W64: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
165 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
166 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
167 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
168 ; W64-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
169 ; W64: v_cmp_eq_u64_e32 vcc, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
170 ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
171 ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]]
172 ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]]
173 ; W64: s_waitcnt vmcnt(0)
174 ; W64: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
175 ; W64: s_xor_b64 exec, exec, [[CMP]]
176 ; W64: s_cbranch_execnz [[LOOPBB1]]
177
178 ; W64: s_mov_b64 exec, [[SAVEEXEC]]
179
180 ; W64: [[TERMBB]]:
181 ; W64: global_store_dword v[11:12], [[RES]], off
182
183
184 ; W32-LABEL: mubuf_vgpr_outside_entry
185
186 ; W32-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
187 ; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
188
189 ; W32: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
190 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v0
191 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v1
192 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v2
193 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v3
194 ; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[0:1]
195 ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3]
196 ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
197 ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
198 ; W32: s_waitcnt vmcnt(0)
199 ; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
200 ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
201 ; W32: s_cbranch_execnz [[LOOPBB0]]
202
203 ; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
204 ; W32: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
205
206 ; W32: BB{{[0-9]+_[0-9]+}}:
207 ; W32-DAG: v_mov_b32_e32 [[IDX:v[0-9]+]], s4
208 ; W32-DAG: s_mov_b32 [[SAVEEXEC:s[0-9]+]], exec_lo
209
210 ; W32: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
211 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC0:[0-9]+]], v4
212 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC1:[0-9]+]], v5
213 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC2:[0-9]+]], v6
214 ; W32-DAG: v_readfirstlane_b32 s[[SRSRC3:[0-9]+]], v7
215 ; W32: v_cmp_eq_u64_e32 vcc_lo, s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v[4:5]
216 ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7]
217 ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]]
218 ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]]
219 ; W32: s_waitcnt vmcnt(0)
220 ; W32: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen
221 ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]]
222 ; W32: s_cbranch_execnz [[LOOPBB1]]
223
224 ; W32: s_mov_b32 exec_lo, [[SAVEEXEC]]
225
226 ; W32: [[TERMBB]]:
227 ; W32: global_store_dword v[11:12], [[RES]], off
228
119229
120230 ; Confirm spills do not occur between the XOR and branch that terminate the
121231 ; waterfall loop BBs.
122232
123 ; CHECK-O0-LABEL: mubuf_vgpr_outside_entry
124
125 ; CHECK-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s4
126 ; CHECK-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], [[IDX_S]]
127 ; CHECK-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec
128 ; CHECK-O0-DAG: buffer_store_dword [[IDX_V]], off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
129 ; CHECK-O0-DAG: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]]
130 ; CHECK-O0-DAG: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]]
131
132 ; CHECK-O0: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
133 ; CHECK-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
134 ; CHECK-O0: s_waitcnt vmcnt(0)
135 ; CHECK-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
136 ; CHECK-O0: s_waitcnt vmcnt(0)
137 ; CHECK-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
138 ; CHECK-O0: s_waitcnt vmcnt(0)
139 ; CHECK-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
140 ; CHECK-O0: s_waitcnt vmcnt(0)
141 ; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
142 ; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
143 ; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
144 ; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
145 ; CHECK-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[SRSRCTMP0]]
146 ; CHECK-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
147 ; CHECK-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
148 ; CHECK-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
149 ; CHECK-O0: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}}
150 ; CHECK-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
151 ; CHECK-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
152 ; CHECK-O0: s_and_saveexec_b64 [[CMP]], [[CMP]]
153 ; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload
154 ; CHECK-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen
155 ; CHECK-O0: s_waitcnt vmcnt(0)
156 ; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
157 ; CHECK-O0: s_xor_b64 exec, exec, [[CMP]]
158 ; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB0]]
159
160 ; CHECK-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]]
161 ; CHECK-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]]
162 ; CHECK-O0: s_mov_b64 exec, s{{\[}}[[SAVEEXEC0]]:[[SAVEEXEC1]]{{\]}}
163 ; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
164 ; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill
165 ; CHECK-O0: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
166
167 ; CHECK-O0: BB{{[0-9]+_[0-9]+}}:
168 ; CHECK-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec
169 ; CHECK-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
170 ; CHECK-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]]
171 ; CHECK-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]]
172
173 ; CHECK-O0: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
174 ; CHECK-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
175 ; CHECK-O0: s_waitcnt vmcnt(0)
176 ; CHECK-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
177 ; CHECK-O0: s_waitcnt vmcnt(0)
178 ; CHECK-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
179 ; CHECK-O0: s_waitcnt vmcnt(0)
180 ; CHECK-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
181 ; CHECK-O0: s_waitcnt vmcnt(0)
182 ; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
183 ; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
184 ; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
185 ; CHECK-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
186 ; CHECK-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[SRSRCTMP0]]
187 ; CHECK-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
188 ; CHECK-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
189 ; CHECK-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
190 ; CHECK-O0: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}}
191 ; CHECK-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
192 ; CHECK-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
193 ; CHECK-O0: s_and_saveexec_b64 [[CMP]], [[CMP]]
194 ; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload
195 ; CHECK-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen
196 ; CHECK-O0: s_waitcnt vmcnt(0)
197 ; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
198 ; CHECK-O0: s_xor_b64 exec, exec, [[CMP]]
199 ; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB1]]
200
201 ; CHECK-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]]
202 ; CHECK-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]]
203 ; CHECK-O0: s_mov_b64 exec, s{{\[}}[[SAVEEXEC0]]:[[SAVEEXEC1]]{{\]}}
204 ; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
205 ; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Spill
206
207 ; CHECK-O0: [[TERMBB]]:
208 ; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Reload
209 ; CHECK-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off
233 ; W64-O0-LABEL: mubuf_vgpr_outside_entry
234
235 ; W64-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s4
236 ; W64-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], [[IDX_S]]
237 ; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
238 ; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
239
240 ; W64-O0: [[LOOPBB0:BB[0-9]+_[0-9]+]]:
241 ; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
242 ; W64-O0: s_waitcnt vmcnt(0)
243 ; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
244 ; W64-O0: s_waitcnt vmcnt(0)
245 ; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
246 ; W64-O0: s_waitcnt vmcnt(0)
247 ; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
248 ; W64-O0: s_waitcnt vmcnt(0)
249 ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
250 ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
251 ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
252 ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
253 ; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[SRSRCTMP0]]
254 ; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
255 ; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
256 ; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
257 ; W64-O0: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}}
258 ; W64-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
259 ; W64-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
260 ; W64-O0: s_and_saveexec_b64 [[CMP]], [[CMP]]
261 ; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload
262 ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen
263 ; W64-O0: s_waitcnt vmcnt(0)
264 ; W64-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
265 ; W64-O0: s_xor_b64 exec, exec, [[CMP]]
266 ; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB0]]
267 ; FIXME-W64-O0: s_mov_b64 exec, [[SAVEEXEC]]
268 ; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
269 ; W64-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill
270 ; W64-O0: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
271
272 ; W64-O0: BB{{[0-9]+_[0-9]+}}:
273 ; W64-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec
274 ; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill
275 ; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]]
276 ; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]]
277
278 ; W64-O0: [[LOOPBB1:BB[0-9]+_[0-9]+]]:
279 ; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload
280 ; W64-O0: s_waitcnt vmcnt(0)
281 ; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload
282 ; W64-O0: s_waitcnt vmcnt(0)
283 ; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload
284 ; W64-O0: s_waitcnt vmcnt(0)
285 ; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload
286 ; W64-O0: s_waitcnt vmcnt(0)
287 ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]]
288 ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
289 ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP2:[0-9]+]], v[[VRSRC2]]
290 ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP3:[0-9]+]], v[[VRSRC3]]
291 ; W64-O0-DAG: s_mov_b32 s[[SRSRC0:[0-9]+]], s[[SRSRCTMP0]]
292 ; W64-O0-DAG: s_mov_b32 s[[SRSRC1:[0-9]+]], s[[SRSRCTMP1]]
293 ; W64-O0-DAG: s_mov_b32 s[[SRSRC2:[0-9]+]], s[[SRSRCTMP2]]
294 ; W64-O0-DAG: s_mov_b32 s[[SRSRC3:[0-9]+]], s[[SRSRCTMP3]]
295 ; W64-O0: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC0]]:[[SRSRC1]]{{\]}}, v{{\[}}[[VRSRC0]]:[[VRSRC1]]{{\]}}
296 ; W64-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}}
297 ; W64-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
298 ; W64-O0: s_and_saveexec_b64 [[CMP]], [[CMP]]
299 ; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload
300 ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen
301 ; W64-O0: s_waitcnt vmcnt(0)
302 ; W64-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
303 ; W64-O0: s_xor_b64 exec, exec, [[CMP]]
304 ; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]]
305
306 ; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]]
307 ; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]]
308 ; W64-O0: s_mov_b64 exec, s{{\[}}[[SAVEEXEC0]]:[[SAVEEXEC1]]{{\]}}
309 ; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
310 ; W64-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Spill
311
312 ; W64-O0: [[TERMBB]]:
313 ; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Reload
314 ; W64-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off
210315
211316 define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %in, float addrspace(1)* %out) #0 {
212317 entry:
0 # RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass=si-optimize-exec-masking-pre-ra -o - %s | FileCheck -check-prefix=GCN %s
1
2 # GCN: name: negated_cond_vop2
3 # GCN: %0:sgpr_32 = IMPLICIT_DEF
4 # GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
5 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
6 ---
7 name: negated_cond_vop2
8 body: |
9 bb.0:
10 %0:sgpr_32 = IMPLICIT_DEF
11 %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
12 V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
13 $vcc_lo = S_AND_B32 $exec_lo, killed $vcc_lo, implicit-def dead $scc
14 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
15 S_BRANCH %bb.1
16
17 bb.1:
18 S_BRANCH %bb.0
19
20 bb.2:
21 S_ENDPGM 0
22 ...
23
24 # GCN: name: negated_cond_vop3
25 # GCN: %0:sgpr_32 = IMPLICIT_DEF
26 # GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
27 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
28 ---
29 name: negated_cond_vop3
30 body: |
31 bb.0:
32 %0:sgpr_32 = IMPLICIT_DEF
33 %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
34 %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
35 $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
36 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
37 S_BRANCH %bb.1
38
39 bb.1:
40 S_BRANCH %bb.0
41
42 bb.2:
43 S_ENDPGM 0
44 ...
45
46 # GCN: name: negated_cond_vop2_redef_vcc1
47 # GCN: %0:sgpr_32 = IMPLICIT_DEF
48 # GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
49 # GCN-NEXT: V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
50 # GCN-NEXT: $vcc_lo = COPY $sgpr0
51 # GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, $vcc_lo, implicit-def dead $scc
52 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
53 ---
54 name: negated_cond_vop2_redef_vcc1
55 body: |
56 bb.0:
57 %0:sgpr_32 = IMPLICIT_DEF
58 %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
59 V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
60 $vcc_lo = COPY $sgpr0
61 $vcc_lo = S_AND_B32 $exec_lo, killed $vcc_lo, implicit-def dead $scc
62 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
63 S_BRANCH %bb.1
64
65 bb.1:
66 S_BRANCH %bb.0
67
68 bb.2:
69 S_ENDPGM 0
70 ...
71
72 # GCN: name: negated_cond_vop3_redef_cmp
73 # GCN: %0:sgpr_32 = IMPLICIT_DEF
74 # GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
75 # GCN-NEXT: %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
76 # GCN-NEXT: %2:sgpr_32 = COPY $sgpr0
77 # GCN-NEXT: $vcc_lo = S_AND_B32 %2, $exec_lo, implicit-def dead $scc
78 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
79 ---
80 name: negated_cond_vop3_redef_cmp
81 body: |
82 bb.0:
83 %0:sgpr_32 = IMPLICIT_DEF
84 %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
85 %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
86 %2 = COPY $sgpr0
87 $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
88 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
89 S_BRANCH %bb.1
90
91 bb.1:
92 S_BRANCH %bb.0
93
94 bb.2:
95 S_ENDPGM 0
96 ...
97
98 # GCN: name: negated_cond_undef_vcc
99 # GCN: $vcc_lo = S_AND_B32 $exec_lo, undef $vcc_lo, implicit-def dead $scc
100 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
101 ---
102 name: negated_cond_undef_vcc
103 body: |
104 bb.0:
105 $vcc_lo = S_AND_B32 $exec_lo, undef $vcc_lo, implicit-def dead $scc
106 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
107 S_BRANCH %bb.1
108
109 bb.1:
110 S_BRANCH %bb.0
111
112 bb.2:
113 S_ENDPGM 0
114 ...
115
116 # GCN: name: negated_cond_vop3_imp_vcc
117 # GCN: $vcc_lo = IMPLICIT_DEF
118 # GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, $vcc_lo, implicit-def $scc
119 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
120 ---
121 name: negated_cond_vop3_imp_vcc
122 body: |
123 bb.0:
124 $vcc_lo = IMPLICIT_DEF
125 %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $vcc_lo, implicit $exec
126 %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
127 $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
128 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
129 S_BRANCH %bb.1
130
131 bb.1:
132 S_BRANCH %bb.0
133
134 bb.2:
135 S_ENDPGM 0
136 ...
137
138 # GCN: name: negated_cond_vop2_imp_vcc
139 # GCN: $vcc_lo = IMPLICIT_DEF
140 # GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, $vcc_lo, implicit-def $scc
141 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
142 ---
143 name: negated_cond_vop2_imp_vcc
144 body: |
145 bb.0:
146 $vcc_lo = IMPLICIT_DEF
147 %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $vcc_lo, implicit $exec
148 V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
149 $vcc_lo = S_AND_B32 killed $vcc_lo, $exec_lo, implicit-def dead $scc
150 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
151 S_BRANCH %bb.1
152
153 bb.1:
154 S_BRANCH %bb.0
155
156 bb.2:
157 S_ENDPGM 0
158 ...
159
160 # GCN: name: negated_cond_vop3_redef_sel
161 # GCN: %0:sgpr_32 = IMPLICIT_DEF
162 # GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
163 # GCN-NEXT: %1:vgpr_32 = COPY $vgpr0
164 # GCN-NEXT: %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
165 # GCN-NEXT: $vcc_lo = S_AND_B32 %2, $exec_lo, implicit-def dead $scc
166 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
167 ---
168 name: negated_cond_vop3_redef_sel
169 body: |
170 bb.0:
171 %0:sgpr_32 = IMPLICIT_DEF
172 %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
173 %1:vgpr_32 = COPY $vgpr0
174 %2:sgpr_32 = V_CMP_NE_U32_e64 %1, 1, implicit $exec
175 $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
176 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
177 S_BRANCH %bb.1
178
179 bb.1:
180 S_BRANCH %bb.0
181
182 bb.2:
183 S_ENDPGM 0
184 ...
185
186 # GCN: name: negated_cond_vop2_used_sel
187 # GCN: %0:sgpr_32 = IMPLICIT_DEF
188 # GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
189 # GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
190 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
191 ---
192 name: negated_cond_vop2_used_sel
193 body: |
194 bb.0:
195 %0:sgpr_32 = IMPLICIT_DEF
196 %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
197 V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
198 $vcc_lo = S_AND_B32 $exec_lo, killed $vcc_lo, implicit-def dead $scc
199 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
200 S_BRANCH %bb.1
201
202 bb.1:
203 S_BRANCH %bb.0
204
205 bb.2:
206 $vgpr0 = COPY %1
207 S_ENDPGM 0
208 ...
209
210 # GCN: name: negated_cond_vop2_used_vcc
211 # GCN: %0:sgpr_32 = IMPLICIT_DEF
212 # GCN-NEXT: %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
213 # GCN-NEXT: V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
214 # GCN-NEXT: $sgpr0_sgpr1 = COPY $vcc
215 # GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
216 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
217 ---
218 name: negated_cond_vop2_used_vcc
219 body: |
220 bb.0:
221 %0:sgpr_32 = IMPLICIT_DEF
222 %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
223 V_CMP_NE_U32_e32 1, %1, implicit-def $vcc, implicit $exec
224 $sgpr0_sgpr1 = COPY $vcc
225 $vcc_lo = S_AND_B32 $exec_lo, killed $vcc_lo, implicit-def dead $scc
226 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
227 S_BRANCH %bb.1
228
229 bb.1:
230 S_BRANCH %bb.0
231
232 bb.2:
233 S_ENDPGM 0
234 ...
235
236 # GCN: name: negated_cond_vop3_sel_wrong_subreg1
237 # GCN: %0:sgpr_32 = IMPLICIT_DEF
238 # GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
239 # GCN-NEXT: %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
240 # GCN-NEXT: %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec
241 # GCN-NEXT: $vcc_lo = S_AND_B32 %2, $exec_lo, implicit-def dead $scc
242 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
243 ---
244 name: negated_cond_vop3_sel_wrong_subreg1
245 body: |
246 bb.0:
247 %0:sgpr_32 = IMPLICIT_DEF
248 %1.sub1 = IMPLICIT_DEF
249 %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
250 %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec
251 $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
252 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
253 S_BRANCH %bb.1
254
255 bb.1:
256 S_BRANCH %bb.0
257
258 bb.2:
259 S_ENDPGM 0
260 ...
261
262 # GCN: name: negated_cond_vop3_sel_wrong_subreg2
263 # GCN: %0:sgpr_32 = IMPLICIT_DEF
264 # GCN-NEXT: %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
265 # GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
266 # GCN-NEXT: %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec
267 # GCN-NEXT: $vcc_lo = S_AND_B32 %2, $exec_lo, implicit-def dead $scc
268 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
269 ---
270 name: negated_cond_vop3_sel_wrong_subreg2
271 body: |
272 bb.0:
273 %0:sgpr_32 = IMPLICIT_DEF
274 %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
275 %1.sub1 = IMPLICIT_DEF
276 %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub1, 1, implicit $exec
277 $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
278 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
279 S_BRANCH %bb.1
280
281 bb.1:
282 S_BRANCH %bb.0
283
284 bb.2:
285 S_ENDPGM 0
286 ...
287
288 # GCN: name: negated_cond_vop3_sel_right_subreg1
289 # GCN: %0:sgpr_32 = IMPLICIT_DEF
290 # GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
291 # GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
292 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
293 ---
294 name: negated_cond_vop3_sel_right_subreg1
295 body: |
296 bb.0:
297 %0:sgpr_32 = IMPLICIT_DEF
298 %1.sub1 = IMPLICIT_DEF
299 %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
300 %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub0, 1, implicit $exec
301 $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
302 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
303 S_BRANCH %bb.1
304
305 bb.1:
306 S_BRANCH %bb.0
307
308 bb.2:
309 S_ENDPGM 0
310 ...
311
312 # GCN: name: negated_cond_vop3_sel_right_subreg2
313 # GCN: %0:sgpr_32 = IMPLICIT_DEF
314 # GCN-NEXT: %1.sub1:vreg_64 = IMPLICIT_DEF
315 # GCN-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %0, implicit-def $scc
316 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
317 ---
318 name: negated_cond_vop3_sel_right_subreg2
319 body: |
320 bb.0:
321 %0:sgpr_32 = IMPLICIT_DEF
322 %1.sub0:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
323 %1.sub1 = IMPLICIT_DEF
324 %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub0, 1, implicit $exec
325 $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
326 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
327 S_BRANCH %bb.1
328
329 bb.1:
330 S_BRANCH %bb.0
331
332 bb.2:
333 S_ENDPGM 0
334 ...
335
336 # GCN: name: negated_cond_vop3_sel_subreg_overlap
337 # GCN: %0:sgpr_32 = IMPLICIT_DEF
338 # GCN-NEXT: %1.sub2:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
339 # GCN-NEXT: %1.sub2_sub3:vreg_128 = IMPLICIT_DEF
340 # GCN-NEXT: %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub2, 1, implicit $exec
341 # GCN-NEXT: $vcc_lo = S_AND_B32 %2, $exec_lo, implicit-def dead $scc
342 # GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
343 ---
344 name: negated_cond_vop3_sel_subreg_overlap
345 body: |
346 bb.0:
347 %0:sgpr_32 = IMPLICIT_DEF
348 %1.sub2:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
349 %1.sub2_sub3 = IMPLICIT_DEF
350 %2:sgpr_32 = V_CMP_NE_U32_e64 %1.sub2, 1, implicit $exec
351 $vcc_lo = S_AND_B32 killed %2, $exec_lo, implicit-def dead $scc
352 S_CBRANCH_VCCNZ %bb.2, implicit killed $vcc
353 S_BRANCH %bb.1
354
355 bb.1:
356 S_BRANCH %bb.0
357
358 bb.2:
359 S_ENDPGM 0
360 ...
0 # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -show-encoding < %s | FileCheck -check-prefix=GFX10 %s
1
2 v_add_co_ci_u32_e32 v3, vcc_lo, 12345, v3, vcc_lo
3 // GFX10: v_add_co_ci_u32_e32 v3, vcc_lo, 0x3039, v3, vcc_lo ; encoding: [0xff,0x06,0x06,0x50,0x39,0x30,0x00,0x00]
4
5 v_cndmask_b32 v0, 12345, v1, vcc_lo
6 // GFX10: v_cndmask_b32_e32 v0, 0x3039, v1, vcc_lo ; encoding: [0xff,0x02,0x00,0x02,0x39,0x30,0x00,0x00]
0 // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+WavefrontSize32,-WavefrontSize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W32 %s
1 // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -show-encoding %s | FileCheck --check-prefixes=GFX10,W64 %s
2 // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+WavefrontSize32,-WavefrontSize64 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX10-ERR,W32-ERR %s
3 // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX10-ERR,W64-ERR %s
4
5 //===----------------------------------------------------------------------===//
6 // ENC_DS.
7 //===----------------------------------------------------------------------===//
8
9 ds_add_u32 v0, v1
10 // GFX10: encoding: [0x00,0x00,0x00,0xd8,0x00,0x01,0x00,0x00]
11
12 ds_add_u32 v255, v254
13 // GFX10: encoding: [0x00,0x00,0x00,0xd8,0xff,0xfe,0x00,0x00]
14
15 ds_add_u32 v0, v254
16 // GFX10: encoding: [0x00,0x00,0x00,0xd8,0x00,0xfe,0x00,0x00]
17
18 ds_add_u32 v255, v1
19 // GFX10: encoding: [0x00,0x00,0x00,0xd8,0xff,0x01,0x00,0x00]
20
21 ds_add_u32 v0, v1 offset:0
22 // GFX10: encoding: [0x00,0x00,0x00,0xd8,0x00,0x01,0x00,0x00]
23
24 ds_add_u32 v255, v254 offset:0
25 // GFX10: encoding: [0x00,0x00,0x00,0xd8,0xff,0xfe,0x00,0x00]
26
27 ds_add_u32 v0, v254 offset:0
28 // GFX10: encoding: [0x00,0x00,0x00,0xd8,0x00,0xfe,0x00,0x00]
29
30 ds_add_u32 v255, v1 offset:0
31 // GFX10: encoding: [0x00,0x00,0x00,0xd8,0xff,0x01,0x00,0x00]
32
33 ds_add_u32 v0, v1 offset:4660
34 // GFX10: encoding: [0x34,0x12,0x00,0xd8,0x00,0x01,0x00,0x00]
35
36 ds_add_u32 v255, v254 offset:4660
37 // GFX10: encoding: [0x34,0x12,0x00,0xd8,0xff,0xfe,0x00,0x00]
38
39 ds_add_u32 v0, v254 offset:4660
40 // GFX10: encoding: [0x34,0x12,0x00,0xd8,0x00,0xfe,0x00,0x00]
41
42 ds_add_u32 v255, v1 offset:4660
43 // GFX10: encoding: [0x34,0x12,0x00,0xd8,0xff,0x01,0x00,0x00]
44
45 ds_add_u32 v0, v1 offset:65535
46 // GFX10: encoding: [0xff,0xff,0x00,0xd8,0x00,0x01,0x00,0x00]
47
48 ds_add_u32 v255, v254 offset:65535
49 // GFX10: encoding: [0xff,0xff,0x00,0xd8,0xff,0xfe,0x00,0x00]
50
51 ds_add_u32 v0, v254 offset:65535
52 // GFX10: encoding: [0xff,0xff,0x00,0xd8,0x00,0xfe,0x00,0x00]
53
54 ds_add_u32 v255, v1 offset:65535
55 // GFX10: encoding: [0xff,0xff,0x00,0xd8,0xff,0x01,0x00,0x00]
56
57 ds_add_u32 v0, v1 gds
58 // GFX10: encoding: [0x00,0x00,0x02,0xd8,0x00,0x01,0x00,0x00]
59
60 ds_add_u32 v255, v254 gds
61 // GFX10: encoding: [0x00,0x00,0x02,0xd8,0xff,0xfe,0x00,0x00]
62
63 ds_add_u32 v0, v254 gds
64 // GFX10: encoding: [0x00,0x00,0x02,0xd8,0x00,0xfe,0x00,0x00]
65
66 ds_add_u32 v255, v1 gds
67 // GFX10: encoding: [0x00,0x00,0x02,0xd8,0xff,0x01,0x00,0x00]
68
69 ds_add_u32 v0, v1 offset:0 gds
70 // GFX10: encoding: [0x00,0x00,0x02,0xd8,0x00,0x01,0x00,0x00]
71
72 ds_add_u32 v255, v254 offset:0 gds
73 // GFX10: encoding: [0x00,0x00,0x02,0xd8,0xff,0xfe,0x00,0x00]
74
75 ds_add_u32 v0, v254 offset:0 gds
76 // GFX10: encoding: [0x00,0x00,0x02,0xd8,0x00,0xfe,0x00,0x00]
77
78 ds_add_u32 v255, v1 offset:0 gds
79 // GFX10: encoding: [0x00,0x00,0x02,0xd8,0xff,0x01,0x00,0x00]
80
81 ds_add_u32 v0, v1 offset:4660 gds
82 // GFX10: encoding: [0x34,0x12,0x02,0xd8,0x00,0x01,0x00,0x00]
83
84 ds_add_u32 v255, v254 offset:4660 gds
85 // GFX10: encoding: [0x34,0x12,0x02,0xd8,0xff,0xfe,0x00,0x00]
86
87 ds_add_u32 v0, v254 offset:4660 gds
88 // GFX10: encoding: [0x34,0x12,0x02,0xd8,0x00,0xfe,0x00,0x00]
89
90 ds_add_u32 v255, v1 offse