llvm / d706d03
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel

Currently the default C calling convention functions are treated the same as compute kernels. Make this explicit so the default calling convention can be changed to a non-kernel.

Converted with perl -pi -e 's/define void/define amdgpu_kernel void/' on the relevant test directories (and undoing it in the one place that actually wanted a non-kernel).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298444 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault, 2 years ago
714 changed files with 6249 additions and 6249 deletions.
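For reference, the mechanical conversion described in the commit message could be reproduced with a one-liner along the following lines. This is only a sketch: the test directory paths and the find invocation are assumptions for illustration, not taken from the commit itself; only the perl substitution is the one quoted above.

    # Hypothetical invocation over the AMDGPU test directories (paths are assumed, adjust as needed).
    # Applies the substitution quoted in the commit message to every .ll and .mir test in place.
    find test/Analysis/CostModel/AMDGPU test/CodeGen/AMDGPU test/Analysis/DivergenceAnalysis/AMDGPU \
        \( -name '*.ll' -o -name '*.mir' \) \
        | xargs perl -pi -e 's/define void/define amdgpu_kernel void/'

The one function that was meant to stay a non-kernel (per the commit message) would then be reverted by hand.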
22
33 ; CHECK: 'add_i32'
44 ; CHECK: estimated cost of 1 for {{.*}} add i32
5 define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
5 define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
66 %vec = load i32, i32 addrspace(1)* %vaddr
77 %add = add i32 %vec, %b
88 store i32 %add, i32 addrspace(1)* %out
1111
1212 ; CHECK: 'add_v2i32'
1313 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i32>
14 define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
14 define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
1515 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
1616 %add = add <2 x i32> %vec, %b
1717 store <2 x i32> %add, <2 x i32> addrspace(1)* %out
2020
2121 ; CHECK: 'add_v3i32'
2222 ; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
23 define void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
23 define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
2424 %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
2525 %add = add <3 x i32> %vec, %b
2626 store <3 x i32> %add, <3 x i32> addrspace(1)* %out
2929
3030 ; CHECK: 'add_v4i32'
3131 ; CHECK: estimated cost of 4 for {{.*}} add <4 x i32>
32 define void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
32 define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
3333 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
3434 %add = add <4 x i32> %vec, %b
3535 store <4 x i32> %add, <4 x i32> addrspace(1)* %out
3838
3939 ; CHECK: 'add_i64'
4040 ; CHECK: estimated cost of 2 for {{.*}} add i64
41 define void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
41 define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
4242 %vec = load i64, i64 addrspace(1)* %vaddr
4343 %add = add i64 %vec, %b
4444 store i64 %add, i64 addrspace(1)* %out
4747
4848 ; CHECK: 'add_v2i64'
4949 ; CHECK: estimated cost of 4 for {{.*}} add <2 x i64>
50 define void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
50 define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
5151 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
5252 %add = add <2 x i64> %vec, %b
5353 store <2 x i64> %add, <2 x i64> addrspace(1)* %out
5656
5757 ; CHECK: 'add_v3i64'
5858 ; CHECK: estimated cost of 6 for {{.*}} add <3 x i64>
59 define void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
59 define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
6060 %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
6161 %add = add <3 x i64> %vec, %b
6262 store <3 x i64> %add, <3 x i64> addrspace(1)* %out
6565
6666 ; CHECK: 'add_v4i64'
6767 ; CHECK: estimated cost of 8 for {{.*}} add <4 x i64>
68 define void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
68 define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
6969 %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
7070 %add = add <4 x i64> %vec, %b
7171 store <4 x i64> %add, <4 x i64> addrspace(1)* %out
7474
7575 ; CHECK: 'add_v16i64'
7676 ; CHECK: estimated cost of 32 for {{.*}} add <16 x i64>
77 define void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
77 define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
7878 %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
7979 %add = add <16 x i64> %vec, %b
8080 store <16 x i64> %add, <16 x i64> addrspace(1)* %out
8383
8484 ; CHECK: 'add_i16'
8585 ; CHECK: estimated cost of 1 for {{.*}} add i16
86 define void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
86 define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
8787 %vec = load i16, i16 addrspace(1)* %vaddr
8888 %add = add i16 %vec, %b
8989 store i16 %add, i16 addrspace(1)* %out
9292
9393 ; CHECK: 'add_v2i16'
9494 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i16>
95 define void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
95 define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
9696 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
9797 %add = add <2 x i16> %vec, %b
9898 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
101101
102102 ; CHECK: 'sub_i32'
103103 ; CHECK: estimated cost of 1 for {{.*}} sub i32
104 define void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
104 define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
105105 %vec = load i32, i32 addrspace(1)* %vaddr
106106 %sub = sub i32 %vec, %b
107107 store i32 %sub, i32 addrspace(1)* %out
110110
111111 ; CHECK: 'sub_i64'
112112 ; CHECK: estimated cost of 2 for {{.*}} sub i64
113 define void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
113 define amdgpu_kernel void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
114114 %vec = load i64, i64 addrspace(1)* %vaddr
115115 %sub = sub i64 %vec, %b
116116 store i64 %sub, i64 addrspace(1)* %out
118118 }
119119 ; CHECK: 'sub_i16'
120120 ; CHECK: estimated cost of 1 for {{.*}} sub i16
121 define void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
121 define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
122122 %vec = load i16, i16 addrspace(1)* %vaddr
123123 %sub = sub i16 %vec, %b
124124 store i16 %sub, i16 addrspace(1)* %out
127127
128128 ; CHECK: 'sub_v2i16'
129129 ; CHECK: estimated cost of 2 for {{.*}} sub <2 x i16>
130 define void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
130 define amdgpu_kernel void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
131131 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
132132 %sub = sub <2 x i16> %vec, %b
133133 store <2 x i16> %sub, <2 x i16> addrspace(1)* %out
11
22 ; CHECK: 'or_i32'
33 ; CHECK: estimated cost of 1 for {{.*}} or i32
4 define void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
4 define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
55 %vec = load i32, i32 addrspace(1)* %vaddr
66 %or = or i32 %vec, %b
77 store i32 %or, i32 addrspace(1)* %out
1010
1111 ; CHECK: 'or_i64'
1212 ; CHECK: estimated cost of 2 for {{.*}} or i64
13 define void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
13 define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
1414 %vec = load i64, i64 addrspace(1)* %vaddr
1515 %or = or i64 %vec, %b
1616 store i64 %or, i64 addrspace(1)* %out
1919
2020 ; CHECK: 'xor_i32'
2121 ; CHECK: estimated cost of 1 for {{.*}} xor i32
22 define void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
22 define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
2323 %vec = load i32, i32 addrspace(1)* %vaddr
2424 %or = xor i32 %vec, %b
2525 store i32 %or, i32 addrspace(1)* %out
2828
2929 ; CHECK: 'xor_i64'
3030 ; CHECK: estimated cost of 2 for {{.*}} xor i64
31 define void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
31 define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
3232 %vec = load i64, i64 addrspace(1)* %vaddr
3333 %or = xor i64 %vec, %b
3434 store i64 %or, i64 addrspace(1)* %out
3838
3939 ; CHECK: 'and_i32'
4040 ; CHECK: estimated cost of 1 for {{.*}} and i32
41 define void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
41 define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
4242 %vec = load i32, i32 addrspace(1)* %vaddr
4343 %or = and i32 %vec, %b
4444 store i32 %or, i32 addrspace(1)* %out
4747
4848 ; CHECK: 'and_i64'
4949 ; CHECK: estimated cost of 2 for {{.*}} and i64
50 define void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
50 define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
5151 %vec = load i64, i64 addrspace(1)* %vaddr
5252 %or = and i64 %vec, %b
5353 store i64 %or, i64 addrspace(1)* %out
33 ; CHECK: estimated cost of 10 for instruction: br i1
44 ; CHECK: estimated cost of 10 for instruction: br label
55 ; CHECK: estimated cost of 10 for instruction: ret void
6 define void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
6 define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
77 bb0:
88 br i1 undef, label %bb1, label %bb2
99
2020
2121 ; CHECK: 'test_switch_cost'
2222 ; CHECK: Unknown cost for instruction: switch
23 define void @test_switch_cost(i32 %a) #0 {
23 define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
2424 entry:
2525 switch i32 %a, label %default [
2626 i32 0, label %case0
11
22 ; CHECK: 'extractelement_v2i32'
33 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
4 define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
4 define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
55 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
66 %elt = extractelement <2 x i32> %vec, i32 1
77 store i32 %elt, i32 addrspace(1)* %out
1010
1111 ; CHECK: 'extractelement_v2f32'
1212 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
13 define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
13 define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
1414 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
1515 %elt = extractelement <2 x float> %vec, i32 1
1616 store float %elt, float addrspace(1)* %out
1919
2020 ; CHECK: 'extractelement_v3i32'
2121 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
22 define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
22 define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
2323 %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
2424 %elt = extractelement <3 x i32> %vec, i32 1
2525 store i32 %elt, i32 addrspace(1)* %out
2828
2929 ; CHECK: 'extractelement_v4i32'
3030 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32>
31 define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
31 define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
3232 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
3333 %elt = extractelement <4 x i32> %vec, i32 1
3434 store i32 %elt, i32 addrspace(1)* %out
3737
3838 ; CHECK: 'extractelement_v8i32'
3939 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32>
40 define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
40 define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
4141 %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
4242 %elt = extractelement <8 x i32> %vec, i32 1
4343 store i32 %elt, i32 addrspace(1)* %out
4747 ; FIXME: Should be non-0
4848 ; CHECK: 'extractelement_v8i32_dynindex'
4949 ; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32>
50 define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
50 define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
5151 %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
5252 %elt = extractelement <8 x i32> %vec, i32 %idx
5353 store i32 %elt, i32 addrspace(1)* %out
5656
5757 ; CHECK: 'extractelement_v2i64'
5858 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64>
59 define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
59 define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
6060 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
6161 %elt = extractelement <2 x i64> %vec, i64 1
6262 store i64 %elt, i64 addrspace(1)* %out
6565
6666 ; CHECK: 'extractelement_v3i64'
6767 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64>
68 define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
68 define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
6969 %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
7070 %elt = extractelement <3 x i64> %vec, i64 1
7171 store i64 %elt, i64 addrspace(1)* %out
7474
7575 ; CHECK: 'extractelement_v4i64'
7676 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64>
77 define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
77 define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
7878 %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
7979 %elt = extractelement <4 x i64> %vec, i64 1
8080 store i64 %elt, i64 addrspace(1)* %out
8383
8484 ; CHECK: 'extractelement_v8i64'
8585 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64>
86 define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
86 define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
8787 %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
8888 %elt = extractelement <8 x i64> %vec, i64 1
8989 store i64 %elt, i64 addrspace(1)* %out
9292
9393 ; CHECK: 'extractelement_v4i8'
9494 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8>
95 define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
95 define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
9696 %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
9797 %elt = extractelement <4 x i8> %vec, i8 1
9898 store i8 %elt, i8 addrspace(1)* %out
101101
102102 ; CHECK: 'extractelement_v2i16'
103103 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16>
104 define void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
104 define amdgpu_kernel void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
105105 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
106106 %elt = extractelement <2 x i16> %vec, i16 1
107107 store i16 %elt, i16 addrspace(1)* %out
11
22 ; CHECK: 'fabs_f32'
33 ; CHECK: estimated cost of 0 for {{.*}} call float @llvm.fabs.f32
4 define void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
4 define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
55 %vec = load float, float addrspace(1)* %vaddr
66 %fabs = call float @llvm.fabs.f32(float %vec) #1
77 store float %fabs, float addrspace(1)* %out
1010
1111 ; CHECK: 'fabs_v2f32'
1212 ; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32
13 define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
13 define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
1414 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
1515 %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1
1616 store <2 x float> %fabs, <2 x float> addrspace(1)* %out
1919
2020 ; CHECK: 'fabs_v3f32'
2121 ; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32
22 define void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
22 define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
2323 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
2424 %fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1
2525 store <3 x float> %fabs, <3 x float> addrspace(1)* %out
2828
2929 ; CHECK: 'fabs_f64'
3030 ; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
31 define void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
31 define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
3232 %vec = load double, double addrspace(1)* %vaddr
3333 %fabs = call double @llvm.fabs.f64(double %vec) #1
3434 store double %fabs, double addrspace(1)* %out
3737
3838 ; CHECK: 'fabs_v2f64'
3939 ; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64
40 define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
40 define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
4141 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
4242 %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1
4343 store <2 x double> %fabs, <2 x double> addrspace(1)* %out
4646
4747 ; CHECK: 'fabs_v3f64'
4848 ; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64
49 define void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
49 define amdgpu_kernel void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
5050 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
5151 %fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1
5252 store <3 x double> %fabs, <3 x double> addrspace(1)* %out
5555
5656 ; CHECK: 'fabs_f16'
5757 ; CHECK: estimated cost of 0 for {{.*}} call half @llvm.fabs.f16
58 define void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
58 define amdgpu_kernel void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
5959 %vec = load half, half addrspace(1)* %vaddr
6060 %fabs = call half @llvm.fabs.f16(half %vec) #1
6161 store half %fabs, half addrspace(1)* %out
6464
6565 ; CHECK: 'fabs_v2f16'
6666 ; CHECK: estimated cost of 0 for {{.*}} call <2 x half> @llvm.fabs.v2f16
67 define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
67 define amdgpu_kernel void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
6868 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
6969 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vec) #1
7070 store <2 x half> %fabs, <2 x half> addrspace(1)* %out
7373
7474 ; CHECK: 'fabs_v3f16'
7575 ; CHECK: estimated cost of 0 for {{.*}} call <3 x half> @llvm.fabs.v3f16
76 define void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
76 define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
7777 %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
7878 %fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %vec) #1
7979 store <3 x half> %fabs, <3 x half> addrspace(1)* %out
22
33 ; ALL: 'fadd_f32'
44 ; ALL: estimated cost of 1 for {{.*}} fadd float
5 define void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
5 define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
66 %vec = load float, float addrspace(1)* %vaddr
77 %add = fadd float %vec, %b
88 store float %add, float addrspace(1)* %out
1111
1212 ; ALL: 'fadd_v2f32'
1313 ; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
14 define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
14 define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
1515 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
1616 %add = fadd <2 x float> %vec, %b
1717 store <2 x float> %add, <2 x float> addrspace(1)* %out
2020
2121 ; ALL: 'fadd_v3f32'
2222 ; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
23 define void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
23 define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
2424 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
2525 %add = fadd <3 x float> %vec, %b
2626 store <3 x float> %add, <3 x float> addrspace(1)* %out
3030 ; ALL: 'fadd_f64'
3131 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
3232 ; SLOWF64: estimated cost of 3 for {{.*}} fadd double
33 define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
33 define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
3434 %vec = load double, double addrspace(1)* %vaddr
3535 %add = fadd double %vec, %b
3636 store double %add, double addrspace(1)* %out
4040 ; ALL: 'fadd_v2f64'
4141 ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
4242 ; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double>
43 define void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
43 define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
4444 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
4545 %add = fadd <2 x double> %vec, %b
4646 store <2 x double> %add, <2 x double> addrspace(1)* %out
5050 ; ALL: 'fadd_v3f64'
5151 ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
5252 ; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double>
53 define void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
53 define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
5454 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
5555 %add = fadd <3 x double> %vec, %b
5656 store <3 x double> %add, <3 x double> addrspace(1)* %out
5959
6060 ; ALL 'fadd_f16'
6161 ; ALL estimated cost of 1 for {{.*}} fadd half
62 define void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
62 define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
6363 %vec = load half, half addrspace(1)* %vaddr
6464 %add = fadd half %vec, %b
6565 store half %add, half addrspace(1)* %out
6868
6969 ; ALL 'fadd_v2f16'
7070 ; ALL estimated cost of 2 for {{.*}} fadd <2 x half>
71 define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
71 define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
7272 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
7373 %add = fadd <2 x half> %vec, %b
7474 store <2 x half> %add, <2 x half> addrspace(1)* %out
7777
7878 ; ALL 'fadd_v4f16'
7979 ; ALL estimated cost of 4 for {{.*}} fadd <4 x half>
80 define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
80 define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
8181 %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
8282 %add = fadd <4 x half> %vec, %b
8383 store <4 x half> %add, <4 x half> addrspace(1)* %out
44
55 ; CHECK: 'fdiv_f32'
66 ; ALL: estimated cost of 10 for {{.*}} fdiv float
7 define void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
7 define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
88 %vec = load float, float addrspace(1)* %vaddr
99 %add = fdiv float %vec, %b
1010 store float %add, float addrspace(1)* %out
1313
1414 ; ALL: 'fdiv_v2f32'
1515 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float>
16 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
16 define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
1717 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
1818 %add = fdiv <2 x float> %vec, %b
1919 store <2 x float> %add, <2 x float> addrspace(1)* %out
2222
2323 ; ALL: 'fdiv_v3f32'
2424 ; ALL: estimated cost of 30 for {{.*}} fdiv <3 x float>
25 define void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
25 define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
2626 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
2727 %add = fdiv <3 x float> %vec, %b
2828 store <3 x float> %add, <3 x float> addrspace(1)* %out
3434 ; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
3535 ; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
3636 ; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
37 define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
37 define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
3838 %vec = load double, double addrspace(1)* %vaddr
3939 %add = fdiv double %vec, %b
4040 store double %add, double addrspace(1)* %out
4646 ; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double>
4747 ; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double>
4848 ; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double>
49 define void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
49 define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
5050 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
5151 %add = fdiv <2 x double> %vec, %b
5252 store <2 x double> %add, <2 x double> addrspace(1)* %out
5858 ; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double>
5959 ; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double>
6060 ; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double>
61 define void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
61 define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
6262 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
6363 %add = fdiv <3 x double> %vec, %b
6464 store <3 x double> %add, <3 x double> addrspace(1)* %out
6767
6868 ; ALL: 'fdiv_f16'
6969 ; ALL: estimated cost of 10 for {{.*}} fdiv half
70 define void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
70 define amdgpu_kernel void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
7171 %vec = load half, half addrspace(1)* %vaddr
7272 %add = fdiv half %vec, %b
7373 store half %add, half addrspace(1)* %out
7676
7777 ; ALL: 'fdiv_v2f16'
7878 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x half>
79 define void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
79 define amdgpu_kernel void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
8080 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
8181 %add = fdiv <2 x half> %vec, %b
8282 store <2 x half> %add, <2 x half> addrspace(1)* %out
8585
8686 ; ALL: 'fdiv_v4f16'
8787 ; ALL: estimated cost of 40 for {{.*}} fdiv <4 x half>
88 define void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
88 define amdgpu_kernel void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
8989 %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
9090 %add = fdiv <4 x half> %vec, %b
9191 store <4 x half> %add, <4 x half> addrspace(1)* %out
22
33 ; ALL: 'fmul_f32'
44 ; ALL: estimated cost of 1 for {{.*}} fmul float
5 define void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
5 define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
66 %vec = load float, float addrspace(1)* %vaddr
77 %add = fmul float %vec, %b
88 store float %add, float addrspace(1)* %out
1111
1212 ; ALL: 'fmul_v2f32'
1313 ; ALL: estimated cost of 2 for {{.*}} fmul <2 x float>
14 define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
14 define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
1515 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
1616 %add = fmul <2 x float> %vec, %b
1717 store <2 x float> %add, <2 x float> addrspace(1)* %out
2020
2121 ; ALL: 'fmul_v3f32'
2222 ; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
23 define void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
23 define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
2424 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
2525 %add = fmul <3 x float> %vec, %b
2626 store <3 x float> %add, <3 x float> addrspace(1)* %out
3030 ; ALL: 'fmul_f64'
3131 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
3232 ; SLOWF64: estimated cost of 3 for {{.*}} fmul double
33 define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
33 define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
3434 %vec = load double, double addrspace(1)* %vaddr
3535 %add = fmul double %vec, %b
3636 store double %add, double addrspace(1)* %out
4040 ; ALL: 'fmul_v2f64'
4141 ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double>
4242 ; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double>
43 define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
43 define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
4444 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
4545 %add = fmul <2 x double> %vec, %b
4646 store <2 x double> %add, <2 x double> addrspace(1)* %out
5050 ; ALL: 'fmul_v3f64'
5151 ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double>
5252 ; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double>
53 define void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
53 define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
5454 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
5555 %add = fmul <3 x double> %vec, %b
5656 store <3 x double> %add, <3 x double> addrspace(1)* %out
5959
6060 ; ALL 'fmul_f16'
6161 ; ALL estimated cost of 1 for {{.*}} fmul half
62 define void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
62 define amdgpu_kernel void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
6363 %vec = load half, half addrspace(1)* %vaddr
6464 %add = fmul half %vec, %b
6565 store half %add, half addrspace(1)* %out
6868
6969 ; ALL 'fmul_v2f16'
7070 ; ALL estimated cost of 2 for {{.*}} fmul <2 x half>
71 define void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
71 define amdgpu_kernel void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
7272 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
7373 %add = fmul <2 x half> %vec, %b
7474 store <2 x half> %add, <2 x half> addrspace(1)* %out
7777
7878 ; ALL 'fmul_v4f16'
7979 ; ALL estimated cost of 4 for {{.*}} fmul <4 x half>
80 define void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
80 define amdgpu_kernel void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
8181 %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
8282 %add = fmul <4 x half> %vec, %b
8383 store <4 x half> %add, <4 x half> addrspace(1)* %out
22
33 ; ALL: 'fsub_f32'
44 ; ALL: estimated cost of 1 for {{.*}} fsub float
5 define void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
5 define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
66 %vec = load float, float addrspace(1)* %vaddr
77 %add = fsub float %vec, %b
88 store float %add, float addrspace(1)* %out
1111
1212 ; ALL: 'fsub_v2f32'
1313 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
14 define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
14 define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
1515 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
1616 %add = fsub <2 x float> %vec, %b
1717 store <2 x float> %add, <2 x float> addrspace(1)* %out
2020
2121 ; ALL: 'fsub_v3f32'
2222 ; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
23 define void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
23 define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
2424 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
2525 %add = fsub <3 x float> %vec, %b
2626 store <3 x float> %add, <3 x float> addrspace(1)* %out
3030 ; ALL: 'fsub_f64'
3131 ; FASTF64: estimated cost of 2 for {{.*}} fsub double
3232 ; SLOWF64: estimated cost of 3 for {{.*}} fsub double
33 define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
33 define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
3434 %vec = load double, double addrspace(1)* %vaddr
3535 %add = fsub double %vec, %b
3636 store double %add, double addrspace(1)* %out
4040 ; ALL: 'fsub_v2f64'
4141 ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double>
4242 ; SLOWF64: estimated cost of 6 for {{.*}} fsub <2 x double>
43 define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
43 define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
4444 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
4545 %add = fsub <2 x double> %vec, %b
4646 store <2 x double> %add, <2 x double> addrspace(1)* %out
5050 ; ALL: 'fsub_v3f64'
5151 ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double>
5252 ; SLOWF64: estimated cost of 9 for {{.*}} fsub <3 x double>
53 define void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
53 define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
5454 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
5555 %add = fsub <3 x double> %vec, %b
5656 store <3 x double> %add, <3 x double> addrspace(1)* %out
5959
6060 ; ALL: 'fsub_f16'
6161 ; ALL: estimated cost of 1 for {{.*}} fsub half
62 define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
62 define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
6363 %vec = load half, half addrspace(1)* %vaddr
6464 %add = fsub half %vec, %b
6565 store half %add, half addrspace(1)* %out
6868
6969 ; ALL: 'fsub_v2f16'
7070 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x half>
71 define void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
71 define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
7272 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
7373 %add = fsub <2 x half> %vec, %b
7474 store <2 x half> %add, <2 x half> addrspace(1)* %out
7777
7878 ; ALL: 'fsub_v4f16'
7979 ; ALL: estimated cost of 4 for {{.*}} fsub <4 x half>
80 define void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
80 define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
8181 %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
8282 %add = fsub <4 x half> %vec, %b
8383 store <4 x half> %add, <4 x half> addrspace(1)* %out
11
22 ; CHECK: 'insertelement_v2i32'
33 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i32>
4 define void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
4 define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
55 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
66 %insert = insertelement <2 x i32> %vec, i32 1, i32 123
77 store <2 x i32> %insert, <2 x i32> addrspace(1)* %out
1010
1111 ; CHECK: 'insertelement_v2i64'
1212 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i64>
13 define void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
13 define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
1414 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
1515 %insert = insertelement <2 x i64> %vec, i64 1, i64 123
1616 store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
1919
2020 ; CHECK: 'insertelement_v2i16'
2121 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i16>
22 define void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
22 define amdgpu_kernel void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
2323 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
2424 %insert = insertelement <2 x i16> %vec, i16 1, i16 123
2525 store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
2828
2929 ; CHECK: 'insertelement_v2i8'
3030 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i8>
31 define void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
31 define amdgpu_kernel void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
3232 %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
3333 %insert = insertelement <2 x i8> %vec, i8 1, i8 123
3434 store <2 x i8> %insert, <2 x i8> addrspace(1)* %out
11
22 ; CHECK: 'mul_i32'
33 ; CHECK: estimated cost of 3 for {{.*}} mul i32
4 define void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
4 define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
55 %vec = load i32, i32 addrspace(1)* %vaddr
66 %mul = mul i32 %vec, %b
77 store i32 %mul, i32 addrspace(1)* %out
1010
1111 ; CHECK: 'mul_v2i32'
1212 ; CHECK: estimated cost of 6 for {{.*}} mul <2 x i32>
13 define void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
13 define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
1414 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
1515 %mul = mul <2 x i32> %vec, %b
1616 store <2 x i32> %mul, <2 x i32> addrspace(1)* %out
1919
2020 ; CHECK: 'mul_v3i32'
2121 ; CHECK: estimated cost of 9 for {{.*}} mul <3 x i32>
22 define void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
22 define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
2323 %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
2424 %mul = mul <3 x i32> %vec, %b
2525 store <3 x i32> %mul, <3 x i32> addrspace(1)* %out
2828
2929 ; CHECK: 'mul_v4i32'
3030 ; CHECK: estimated cost of 12 for {{.*}} mul <4 x i32>
31 define void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
31 define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
3232 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
3333 %mul = mul <4 x i32> %vec, %b
3434 store <4 x i32> %mul, <4 x i32> addrspace(1)* %out
3737
3838 ; CHECK: 'mul_i64'
3939 ; CHECK: estimated cost of 16 for {{.*}} mul i64
40 define void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
40 define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
4141 %vec = load i64, i64 addrspace(1)* %vaddr
4242 %mul = mul i64 %vec, %b
4343 store i64 %mul, i64 addrspace(1)* %out
4646
4747 ; CHECK: 'mul_v2i64'
4848 ; CHECK: estimated cost of 32 for {{.*}} mul <2 x i64>
49 define void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
49 define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
5050 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
5151 %mul = mul <2 x i64> %vec, %b
5252 store <2 x i64> %mul, <2 x i64> addrspace(1)* %out
5555
5656 ; CHECK: 'mul_v3i64'
5757 ; CHECK: estimated cost of 48 for {{.*}} mul <3 x i64>
58 define void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
58 define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
5959 %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
6060 %mul = mul <3 x i64> %vec, %b
6161 store <3 x i64> %mul, <3 x i64> addrspace(1)* %out
6464
6565 ; CHECK: 'mul_v4i64'
6666 ; CHECK: estimated cost of 64 for {{.*}} mul <4 x i64>
67 define void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
67 define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
6868 %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
6969 %mul = mul <4 x i64> %vec, %b
7070 store <4 x i64> %mul, <4 x i64> addrspace(1)* %out
7474
7575 ; CHECK: 'mul_v8i64'
7676 ; CHECK: estimated cost of 128 for {{.*}} mul <8 x i64>
77 define void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
77 define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
7878 %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
7979 %mul = mul <8 x i64> %vec, %b
8080 store <8 x i64> %mul, <8 x i64> addrspace(1)* %out
22
33 ; ALL: 'shl_i32'
44 ; ALL: estimated cost of 1 for {{.*}} shl i32
5 define void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
5 define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
66 %vec = load i32, i32 addrspace(1)* %vaddr
77 %or = shl i32 %vec, %b
88 store i32 %or, i32 addrspace(1)* %out
1212 ; ALL: 'shl_i64'
1313 ; FAST64: estimated cost of 2 for {{.*}} shl i64
1414 ; SLOW64: estimated cost of 3 for {{.*}} shl i64
15 define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
15 define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
1616 %vec = load i64, i64 addrspace(1)* %vaddr
1717 %or = shl i64 %vec, %b
1818 store i64 %or, i64 addrspace(1)* %out
2121
2222 ; ALL: 'lshr_i32'
2323 ; ALL: estimated cost of 1 for {{.*}} lshr i32
24 define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
24 define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
2525 %vec = load i32, i32 addrspace(1)* %vaddr
2626 %or = lshr i32 %vec, %b
2727 store i32 %or, i32 addrspace(1)* %out
3131 ; ALL: 'lshr_i64'
3232 ; FAST64: estimated cost of 2 for {{.*}} lshr i64
3333 ; SLOW64: estimated cost of 3 for {{.*}} lshr i64
34 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
34 define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
3535 %vec = load i64, i64 addrspace(1)* %vaddr
3636 %or = lshr i64 %vec, %b
3737 store i64 %or, i64 addrspace(1)* %out
4040
4141 ; ALL: 'ashr_i32'
4242 ; ALL: estimated cost of 1 for {{.*}} ashr i32
43 define void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
43 define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
4444 %vec = load i32, i32 addrspace(1)* %vaddr
4545 %or = ashr i32 %vec, %b
4646 store i32 %or, i32 addrspace(1)* %out
5050 ; ALL: 'ashr_i64'
5151 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
5252 ; SLOW64: estimated cost of 3 for {{.*}} ashr i64
53 define void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
53 define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
5454 %vec = load i64, i64 addrspace(1)* %vaddr
5555 %or = ashr i64 %vec, %b
5656 store i64 %or, i64 addrspace(1)* %out
0 ; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
11
22 ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
3 define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
3 define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
44 %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
55 store i32 %swizzle, i32 addrspace(1)* %out, align 4
66 ret void
44 ; CHECK: DIVERGENT: %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4
55
66 ; The post dominator tree does not have a root node in this case
7 define void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
7 define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
88 bb0:
99 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
1010 %tmp2 = sext i32 %tmp to i64
0 ; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s
11
22 ; CHECK: DIVERGENT: %tmp = cmpxchg volatile
3 define void @unreachable_loop(i32 %tidx) #0 {
3 define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 {
44 entry:
55 unreachable
66
66 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
77
88 ; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
9 define void @workitem_id_x() #1 {
9 define amdgpu_kernel void @workitem_id_x() #1 {
1010 %id.x = call i32 @llvm.amdgcn.workitem.id.x()
1111 store volatile i32 %id.x, i32 addrspace(1)* undef
1212 ret void
1313 }
1414
1515 ; CHECK: DIVERGENT: %id.y = call i32 @llvm.amdgcn.workitem.id.y()
16 define void @workitem_id_y() #1 {
16 define amdgpu_kernel void @workitem_id_y() #1 {
1717 %id.y = call i32 @llvm.amdgcn.workitem.id.y()
1818 store volatile i32 %id.y, i32 addrspace(1)* undef
1919 ret void
2020 }
2121
2222 ; CHECK: DIVERGENT: %id.z = call i32 @llvm.amdgcn.workitem.id.z()
23 define void @workitem_id_z() #1 {
23 define amdgpu_kernel void @workitem_id_z() #1 {
2424 %id.z = call i32 @llvm.amdgcn.workitem.id.z()
2525 store volatile i32 %id.z, i32 addrspace(1)* undef
2626 ret void
2727 }
2828
2929 ; CHECK: DIVERGENT: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
30 define void @mbcnt_lo() #1 {
30 define amdgpu_kernel void @mbcnt_lo() #1 {
3131 %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
3232 store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef
3333 ret void
3434 }
3535
3636 ; CHECK: DIVERGENT: %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
37 define void @mbcnt_hi() #1 {
37 define amdgpu_kernel void @mbcnt_hi() #1 {
3838 %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
3939 store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef
4040 ret void
1212 ; FUNC-LABEL: {{^}}local_address_load:
1313 ; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
1414 ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
15 define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
15 define amdgpu_kernel void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
1616 entry:
1717 %0 = load i32, i32 addrspace(3)* %in
1818 store i32 %0, i32 addrspace(1)* %out
2323 ; SI: s_add_i32 [[SPTR:s[0-9]]]
2424 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
2525 ; SI: ds_read_b32 [[VPTR]]
26 define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
26 define amdgpu_kernel void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
2727 entry:
2828 %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset
2929 %1 = load i32, i32 addrspace(3)* %0
3434 ; FUNC-LABEL: {{^}}local_address_gep_const_offset:
3535 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
3636 ; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
37 define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
37 define amdgpu_kernel void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
3838 entry:
3939 %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1
4040 %1 = load i32, i32 addrspace(3)* %0
4747 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
4848 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
4949 ; SI: ds_read_b32 [[VPTR]]
50 define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
50 define amdgpu_kernel void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
5151 entry:
5252 %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385
5353 %1 = load i32, i32 addrspace(3)* %0
5959 ; SI: v_cmp_ne_u32
6060 ; SI-NOT: v_cmp_ne_u32
6161 ; SI: v_cndmask_b32
62 define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
62 define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
6363 %cmp = icmp ne i32 addrspace(3)* %lds, null
6464 %x = select i1 %cmp, i32 123, i32 456
6565 store i32 %x, i32 addrspace(1)* %out
7070 ; SI: s_mul_i32
7171 ; SI-NEXT: s_add_i32
7272 ; SI: ds_read_b32
73 define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
73 define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
7474 %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
7575 %val = load float, float addrspace(3)* %ptr
7676 store float %val, float addrspace(1)* %out
8282 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
8383 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
8484 ; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
85 define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
85 define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
8686 %val = load float, float addrspace(3)* @g_lds
8787 store float %val, float addrspace(1)* %out
8888 ret void
9494
9595 ; FUNC-LABEL: {{^}}global_ptr:
9696 ; SI: ds_write_b32
97 define void @global_ptr() nounwind {
97 define amdgpu_kernel void @global_ptr() nounwind {
9898 store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
9999 ret void
100100 }
101101
102102 ; FUNC-LABEL: {{^}}local_address_store:
103103 ; SI: ds_write_b32
104 define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
104 define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
105105 store i32 %val, i32 addrspace(3)* %out
106106 ret void
107107 }
110110 ; SI: s_add_i32 [[SADDR:s[0-9]+]],
111111 ; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
112112 ; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
113 define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
113 define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
114114 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
115115 store i32 %val, i32 addrspace(3)* %gep, align 4
116116 ret void
120120 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
121121 ; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
122122 ; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
123 define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
123 define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
124124 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
125125 store i32 %val, i32 addrspace(3)* %gep, align 4
126126 ret void
131131 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
132132 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
133133 ; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
134 define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
134 define amdgpu_kernel void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
135135 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
136136 store i32 %val, i32 addrspace(3)* %gep, align 4
137137 ret void
33 # REQUIRES: global-isel
44
55 --- |
6 define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
6 define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
77 ...
88 ---
99
44 # REQUIRES: global-isel
55
66 --- |
7 define void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
7 define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
88 ...
99 ---
1010
33 # REQUIRES: global-isel
44
55 --- |
6 define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
6 define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
77 ...
88 ---
99
22 # REQUIRES: global-isel
33
44 --- |
5 define void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
6 define void @load_global_uniform(i32 addrspace(1)* %ptr1) {
5 define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
6 define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
77 %tmp0 = load i32, i32 addrspace(1)* %ptr1
88 ret void
99 }
10 define void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
10 define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
1111 %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
1212 %tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0
1313 %tmp2 = load i32, i32 addrspace(1)* %tmp1
88 ; GCN-LABEL: {{^}}smrd0:
99 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
1010 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
11 define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
11 define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
1212 entry:
1313 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
1414 %1 = load i32, i32 addrspace(2)* %0
2020 ; GCN-LABEL: {{^}}smrd1:
2121 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
2222 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
23 define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
23 define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
2424 entry:
2525 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
2626 %1 = load i32, i32 addrspace(2)* %0
3535 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
3636 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
3737 ; GCN: s_endpgm
38 define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
38 define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
3939 entry:
4040 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
4141 %1 = load i32, i32 addrspace(2)* %0
5050 ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
5151 ; TODO: Add VI checks
5252 ; XGCN: s_endpgm
53 define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
53 define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
5454 entry:
5555 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
5656 %1 = load i32, i32 addrspace(2)* %0
6464 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
6565 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
6666 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
67 define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
67 define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
6868 entry:
6969 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
7070 %1 = load i32, i32 addrspace(2)* %0
7878 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
7979 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
8080 ; GCN: s_endpgm
81 define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
81 define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
8282 entry:
8383 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
8484 %1 = load i32, i32 addrspace(2)* %0
22 ; REQUIRES: asserts
33
44 ; Check that SelectionDAGDumper does not crash on int_SI_if.
5 define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
5 define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
66 entry:
77 %0 = icmp eq i64 %a, 0
88 br i1 %0, label %if, label %else
55 ; VI: flat_load_ushort [[B:v[0-9]+]]
66 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
77 ; VI-NEXT: buffer_store_short [[ADD]]
8 define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
8 define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
99 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1010 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
1111 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
2222 ; VI: flat_load_ushort [[A:v[0-9]+]]
2323 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]]
2424 ; VI-NEXT: buffer_store_short [[ADD]]
25 define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
25 define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
2626 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2727 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
2828 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
3737 ; VI: flat_load_ushort [[A:v[0-9]+]]
3838 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]]
3939 ; VI-NEXT: buffer_store_short [[ADD]]
40 define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
40 define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
4141 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4242 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
4343 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
5252 ; VI: flat_load_ushort [[A:v[0-9]+]]
5353 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]]
5454 ; VI-NEXT: buffer_store_short [[ADD]]
55 define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
55 define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
5656 %tid = call i32 @llvm.amdgcn.workitem.id.x()
5757 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
5858 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
6868 ; VI: flat_load_ushort [[B:v[0-9]+]]
6969 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
7070 ; VI-NEXT: buffer_store_dword [[ADD]]
71 define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
71 define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
7272 %tid = call i32 @llvm.amdgcn.workitem.id.x()
7373 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
7474 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
8888 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
8989 ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
9090 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
91 define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
91 define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
9292 %tid = call i32 @llvm.amdgcn.workitem.id.x()
9393 %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
9494 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
108108 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
109109 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
110110 ; VI-NEXT: buffer_store_dword [[SEXT]]
111 define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
111 define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
112112 %tid = call i32 @llvm.amdgcn.workitem.id.x()
113113 %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
114114 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
129129 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
130130 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
131131 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
132 define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
132 define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
133133 %tid = call i32 @llvm.amdgcn.workitem.id.x()
134134 %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
135135 %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
77 ;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
88 ;SI-NOT: [[REG]]
99 ;SI: buffer_store_dword [[REG]],
10 define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
10 define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
1111 %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
1212 %a = load i32, i32 addrspace(1)* %in
1313 %b = load i32, i32 addrspace(1)* %b_ptr
2323 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
2424 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
2525
26 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
26 define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
2727 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
2828 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
2929 %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
4343 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
4444 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
4545
46 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
46 define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
4747 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
4848 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
4949 %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
7070 ; SI: s_add_i32
7171 ; SI: s_add_i32
7272 ; SI: s_add_i32
73 define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
73 define amdgpu_kernel void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
7474 entry:
7575 %0 = add <8 x i32> %a, %b
7676 store <8 x i32> %0, <8 x i32> addrspace(1)* %out
111111 ; SI: s_add_i32
112112 ; SI: s_add_i32
113113 ; SI: s_add_i32
114 define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
114 define amdgpu_kernel void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
115115 entry:
116116 %0 = add <16 x i32> %a, %b
117117 store <16 x i32> %0, <16 x i32> addrspace(1)* %out
128128 ; EG-DAG: ADD_INT
129129 ; EG-DAG: ADD_INT {{[* ]*}}
130130 ; EG-NOT: SUB
131 define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
131 define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
132132 entry:
133133 %0 = add i64 %a, %b
134134 store i64 %0, i64 addrspace(1)* %out
149149 ; EG-DAG: ADD_INT
150150 ; EG-DAG: ADD_INT {{[* ]*}}
151151 ; EG-NOT: SUB
152 define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
152 define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
153153 entry:
154154 %0 = load i64, i64 addrspace(1)* %in
155155 %1 = add i64 %a, %0
168168 ; EG-DAG: ADD_INT
169169 ; EG-DAG: ADD_INT {{[* ]*}}
170170 ; EG-NOT: SUB
171 define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
171 define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
172172 entry:
173173 %0 = icmp eq i64 %a, 0
174174 br i1 %0, label %if, label %else
66
77 ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
88 ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
9 define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
9 define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
1010 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1111 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
1212 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
2626
2727 ; VI: s_add_i32
2828 ; VI: s_add_i32
29 define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
29 define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
3030 %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
3131 %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
3232 %add = add <2 x i16> %a, %b
4040
4141 ; VI: s_add_i32
4242 ; VI: s_add_i32
43 define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
43 define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
4444 %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
4545 %add = add <2 x i16> %a, %a
4646 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
5353
5454 ; VI: v_add_i32
5555 ; VI: v_add_i32
56 define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
56 define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
5757 %add = add <2 x i16> %a, %b
5858 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
5959 ret void
6565
6666 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
6767 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}}
68 define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
68 define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
6969 %tid = call i32 @llvm.amdgcn.workitem.id.x()
7070 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
7171 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
8282
8383 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
8484 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}}
85 define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
85 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
8686 %tid = call i32 @llvm.amdgcn.workitem.id.x()
8787 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
8888 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
101101 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
102102 ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
103103 ; VI: v_or_b32_e32
104 define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
104 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
105105 %tid = call i32 @llvm.amdgcn.workitem.id.x()
106106 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
107107 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
120120 ; VI-NOT: v_add_u16
121121 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
122122 ; VI: v_or_b32_e32
123 define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
123 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
124124 %tid = call i32 @llvm.amdgcn.workitem.id.x()
125125 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
126126 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
140140 ; VI-NOT: v_add_u16
141141 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
142142 ; VI: v_or_b32_e32
143 define void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
143 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
144144 %tid = call i32 @llvm.amdgcn.workitem.id.x()
145145 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
146146 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
172172 ; VI-NOT: and
173173 ; VI-NOT: shl
174174 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
175 define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
175 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
176176 %tid = call i32 @llvm.amdgcn.workitem.id.x()
177177 %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
178178 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
207207 ; VI: v_add_u16_e32
208208
209209 ; VI: buffer_store_dwordx4
210 define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
210 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
211211 %tid = call i32 @llvm.amdgcn.workitem.id.x()
212212 %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
213213 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
235235 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
236236 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
237237 ; VI: buffer_store_dwordx2
238 define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
238 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
239239 %tid = call i32 @llvm.amdgcn.workitem.id.x()
240240 %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
241241 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
263263 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
264264 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
265265 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
266 define void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
266 define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
267267 %tid = call i32 @llvm.amdgcn.workitem.id.x()
268268 %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
269269 %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
55 ; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
66 ; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
77 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]],
8 define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
8 define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
99 %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
1010 %a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid
1111 %b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid
2222 ; GCN: v_addc_u32
2323 ; GCN: v_addc_u32
2424 ; GCN: v_addc_u32
25 define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
25 define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
2626 %foo = load i128, i128 addrspace(1)* %in, align 8
2727 %result = add i128 %foo, %a
2828 store i128 %result, i128 addrspace(1)* %out
3434 ; GCN: v_addc_u32
3535 ; GCN: v_addc_u32
3636 ; GCN: v_addc_u32
37 define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
37 define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
3838 %foo = load i128, i128 addrspace(1)* %in, align 8
3939 %result = add i128 %a, %foo
4040 store i128 %result, i128 addrspace(1)* %out
4646 ; GCN: s_addc_u32
4747 ; GCN: s_addc_u32
4848 ; GCN: s_addc_u32
49 define void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
49 define amdgpu_kernel void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
5050 %result = add i128 %a, %b
5151 store i128 %result, i128 addrspace(1)* %out
5252 ret void
55 ; SI-LABEL: {{^}}test_i64_vreg:
66 ; SI: v_add_i32
77 ; SI: v_addc_u32
8 define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
8 define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
99 %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
1010 %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
1111 %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
2020 ; SI-LABEL: {{^}}sgpr_operand:
2121 ; SI: v_add_i32
2222 ; SI: v_addc_u32
23 define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
23 define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
2424 %foo = load i64, i64 addrspace(1)* %in, align 8
2525 %result = add i64 %foo, %a
2626 store i64 %result, i64 addrspace(1)* %out
3333 ; SI-LABEL: {{^}}sgpr_operand_reversed:
3434 ; SI: v_add_i32
3535 ; SI: v_addc_u32
36 define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
36 define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
3737 %foo = load i64, i64 addrspace(1)* %in, align 8
3838 %result = add i64 %a, %foo
3939 store i64 %result, i64 addrspace(1)* %out
4646 ; SI: s_addc_u32
4747 ; SI: s_add_u32
4848 ; SI: s_addc_u32
49 define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
49 define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
5050 %result = add <2 x i64> %a, %b
5151 store <2 x i64> %result, <2 x i64> addrspace(1)* %out
5252 ret void
5757 ; SI: v_addc_u32
5858 ; SI: v_add_i32
5959 ; SI: v_addc_u32
60 define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
60 define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
6161 %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
6262 %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
6363 %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
7575 ; SI-NOT: addc
7676 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
7777 ; SI: buffer_store_dword [[VRESULT]],
78 define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
78 define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
7979 %add = add i64 %b, %a
8080 %trunc = trunc i64 %add to i32
8181 store i32 %trunc, i32 addrspace(1)* %out, align 8
88 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
99 ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
1010 ; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out
11 define void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
11 define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
1212 entry:
1313 %data = alloca i32, align 4
1414 %cast = addrspacecast i32* %data to i32 addrspace(4)*
2121 ; CHECK: %data = alloca i32, align 4
2222 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
2323 ; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out
24 define void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
24 define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
2525 entry:
2626 %data = alloca i32, align 4
2727 %cast = addrspacecast i32* %data to i32 addrspace(4)*
3434 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
3535 ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
3636 ; CHECK: call void @consume_ptr2int(i32 %ptr2int)
37 define void @addrspacecast_captured_call() #0 {
37 define amdgpu_kernel void @addrspacecast_captured_call() #0 {
3838 entry:
3939 %data = alloca i32, align 4
4040 %cast = addrspacecast i32* %data to i32 addrspace(4)*
88 @global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4
99
1010 ; HSA: @store_cast_0_flat_to_group_addrspacecast() #1
11 define void @store_cast_0_flat_to_group_addrspacecast() #1 {
11 define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 {
1212 store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
1313 ret void
1414 }
1515
1616 ; HSA: @store_cast_0_group_to_flat_addrspacecast() #2
17 define void @store_cast_0_group_to_flat_addrspacecast() #1 {
17 define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 {
1818 store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*)
1919 ret void
2020 }
2121
22 ; HSA: define void @store_constant_cast_group_gv_to_flat() #2
23 define void @store_constant_cast_group_gv_to_flat() #1 {
22 ; HSA: define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #2
23 define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 {
2424 store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*)
2525 ret void
2626 }
2727
2828 ; HSA: @store_constant_cast_group_gv_gep_to_flat() #2
29 define void @store_constant_cast_group_gv_gep_to_flat() #1 {
29 define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 {
3030 store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
3131 ret void
3232 }
3333
3434 ; HSA: @store_constant_cast_global_gv_to_flat() #1
35 define void @store_constant_cast_global_gv_to_flat() #1 {
35 define amdgpu_kernel void @store_constant_cast_global_gv_to_flat() #1 {
3636 store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global.i32 to i32 addrspace(4)*)
3737 ret void
3838 }
3939
4040 ; HSA: @store_constant_cast_global_gv_gep_to_flat() #1
41 define void @store_constant_cast_global_gv_gep_to_flat() #1 {
41 define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 {
4242 store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
4343 ret void
4444 }
4545
4646 ; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
47 define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
47 define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
4848 %val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
4949 store i32 %val, i32 addrspace(1)* %out
5050 ret void
5151 }
5252
5353 ; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
54 define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
54 define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
5555 %val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst
5656 store i32 %val, i32 addrspace(1)* %out
5757 ret void
5858 }
5959
6060 ; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
61 define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
61 define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
6262 %val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst
6363 %val0 = extractvalue { i32, i1 } %val, 0
6464 store i32 %val0, i32 addrspace(1)* %out
6666 }
6767
6868 ; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
69 define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
69 define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
7070 call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false)
7171 ret void
7272 }
7373
7474 ; Can't just search the pointer value
7575 ; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2
76 define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
76 define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
7777 store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out
7878 ret void
7979 }
8080
8181 ; Can't just search pointer types
8282 ; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2
83 define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
83 define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
8484 store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out
8585 ret void
8686 }
8787
8888 ; Cast group to flat, do GEP, cast back to group
8989 ; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2
90 define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
90 define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
9191 store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*)
9292 ret void
9393 }
2727
2828 ; CI: NumSgprs: {{[0-9][0-9]+}}
2929 ; GFX9: NumSgprs: {{[0-9]+}}
30 define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
30 define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
3131 %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
3232 store volatile i32 7, i32 addrspace(4)* %stof
3333 ret void
5757
5858 ; CI: NumSgprs: {{[0-9][0-9]+}}
5959 ; GFX9: NumSgprs: {{[0-9]+}}
60 define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
60 define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
6161 %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
6262 store volatile i32 7, i32 addrspace(4)* %stof
6363 ret void
7272 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
7373 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
7474 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
75 define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
75 define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
7676 %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
7777 store volatile i32 7, i32 addrspace(4)* %stof
7878 ret void
8484 ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
8585 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
8686 ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
87 define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
87 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
8888 %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
8989 %ld = load volatile i32, i32 addrspace(4)* %stof
9090 ret void
101101 ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
102102 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
103103 ; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
104 define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
104 define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
105105 %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
106106 store volatile i32 0, i32 addrspace(3)* %ftos
107107 ret void
118118 ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
119119 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
120120 ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
121 define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
121 define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
122122 %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
123123 store volatile i32 0, i32* %ftos
124124 ret void
132132 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
133133 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
134134 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
135 define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
135 define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
136136 %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
137137 store volatile i32 0, i32 addrspace(1)* %ftos
138138 ret void
143143
144144 ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
145145 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
146 define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
146 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
147147 %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
148148 load volatile i32, i32 addrspace(2)* %ftos
149149 ret void
157157 ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
158158 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
159159 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
160 define void @cast_0_group_to_flat_addrspacecast() #0 {
160 define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
161161 %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
162162 store volatile i32 7, i32 addrspace(4)* %cast
163163 ret void
167167 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
168168 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
169169 ; HSA: ds_write_b32 [[PTR]], [[K]]
170 define void @cast_0_flat_to_group_addrspacecast() #0 {
170 define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
171171 %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
172172 store volatile i32 7, i32 addrspace(3)* %cast
173173 ret void
178178 ; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
179179 ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
180180 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
181 define void @cast_neg1_group_to_flat_addrspacecast() #0 {
181 define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
182182 %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
183183 store volatile i32 7, i32 addrspace(4)* %cast
184184 ret void
188188 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
189189 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
190190 ; HSA: ds_write_b32 [[PTR]], [[K]]
191 define void @cast_neg1_flat_to_group_addrspacecast() #0 {
191 define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
192192 %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
193193 store volatile i32 7, i32 addrspace(3)* %cast
194194 ret void
203203 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
204204 ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
205205 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
206 define void @cast_0_private_to_flat_addrspacecast() #0 {
206 define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
207207 %cast = addrspacecast i32* null to i32 addrspace(4)*
208208 store volatile i32 7, i32 addrspace(4)* %cast
209209 ret void
213213 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
214214 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
215215 ; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
216 define void @cast_0_flat_to_private_addrspacecast() #0 {
216 define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
217217 %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
218218 store volatile i32 7, i32* %cast
219219 ret void
225225 ; HSA-LABEL: {{^}}branch_use_flat_i32:
226226 ; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
227227 ; HSA: s_endpgm
228 define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
228 define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
229229 entry:
230230 %cmp = icmp ne i32 %c, 0
231231 br i1 %cmp, label %local, label %global
258258 ; HSA: flat_store_dword
259259 ; HSA: s_barrier
260260 ; HSA: flat_load_dword
261 define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
261 define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
262262 %alloca = alloca i32, i32 9, align 4
263263 %x = call i32 @llvm.amdgcn.workitem.id.x() #2
264264 %pptr = getelementptr i32, i32* %alloca, i32 %x
1515
1616 ; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
1717 ; SI: s_endpgm
18 define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
18 define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
1919 entry:
2020 %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
2121 %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
2323 ret void
2424 }
2525
26 define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
26 define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
2727 %load = load float, float addrspace(1)* %in, align 4
2828 %fadd32 = fadd float %load, 1.0
2929 %bc = bitcast float %fadd32 to <2 x i16>
3232 ret void
3333 }
3434
35 define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
35 define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
3636 %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
3737 %add.v2i16 = add <2 x i16> %load,
3838 %bc = bitcast <2 x i16> %add.v2i16 to float
4141 ret void
4242 }
4343
44 define void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
44 define amdgpu_kernel void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
4545 %load = load float, float addrspace(1)* %in, align 4
4646 %fadd32 = fadd float %load, 1.0
4747 %bc = bitcast float %fadd32 to <2 x half>
5050 ret void
5151 }
5252
53 define void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
53 define amdgpu_kernel void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
5454 %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4
5555 %add.v2f16 = fadd <2 x half> %load,
5656 %bc = bitcast <2 x half> %add.v2f16 to float
5959 ret void
6060 }
6161
62 define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
62 define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
6363 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
6464 %bc = bitcast <4 x i8> %load to i32
6565 store i32 %bc, i32 addrspace(1)* %out, align 4
6666 ret void
6767 }
6868
69 define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
69 define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
7070 %load = load i32, i32 addrspace(1)* %in, align 4
7171 %bc = bitcast i32 %load to <4 x i8>
7272 store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
7575
7676 ; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
7777 ; SI: s_endpgm
78 define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
78 define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
7979 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
8080 %add = add <2 x i32> %val,
8181 %bc = bitcast <2 x i32> %add to double
8686
8787 ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
8888 ; SI: s_endpgm
89 define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
89 define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
9090 %val = load double, double addrspace(1)* %in, align 8
9191 %add = fadd double %val, 4.0
9292 %bc = bitcast double %add to <2 x i32>
9595 }
9696
9797 ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
98 define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
98 define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
9999 entry:
100100 %cmp0 = icmp eq i32 %cond, 0
101101 br i1 %cmp0, label %if, label %end
111111 }
112112
113113 ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
114 define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
114 define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
115115 entry:
116116 %cmp0 = icmp eq i32 %cond, 0
117117 br i1 %cmp0, label %if, label %end
1414 ; GCN-NOT: v0
1515 ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
1616 ; GCN: buffer_store_dword [[RESULT]]
17 define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
17 define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
1818 entry:
1919 %0 = alloca [2 x i32]
2020 %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
33
44 ; NOOP-LABEL: @noop_fdiv_fpmath(
55 ; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
6 define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
6 define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
77 %md.25ulp = fdiv float %a, %b, !fpmath !0
88 store volatile float %md.25ulp, float addrspace(1)* %out
99 ret void
1717 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
1818 ; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
1919 ; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
20 define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
20 define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
2121 %no.md = fdiv float %a, %b
2222 store volatile float %no.md, float addrspace(1)* %out
2323
5050 ; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
5151 ; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
5252 ; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
53 define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
53 define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
5454 %no.md = fdiv float 1.0, %x
5555 store volatile float %no.md, float addrspace(1)* %out
5656
8888 ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
8989 ; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
9090 ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
91 define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
91 define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
9292 %no.md = fdiv <2 x float> %a, %b
9393 store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
9494
119119 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
120120 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
121121 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
122 define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
122 define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
123123 %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
124124 store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
125125
157157 ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
158158 ; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
159159 ; CHECK: store volatile <2 x float> %fast.25ulp
160 define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
160 define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
161161 %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
162162 store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
163163
185185 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
186186 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
187187 ; CHECK: store volatile <2 x float> %fast.25ulp
188 define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
188 define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
189189 %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
190190
191191 %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
205205 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
206206 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
207207 ; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
208 define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
208 define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
209209 %no.md = fdiv float %a, %b
210210 store volatile float %no.md, float addrspace(1)* %out
211211
7979 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
8080 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
8181 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
82 define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
82 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
8383 entry:
8484 %stack = alloca [5 x i32], align 4
8585 %0 = load i32, i32 addrspace(1)* %in, align 4
101101
102102 ; OPT-LABEL: @high_alignment(
103103 ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
104 define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
104 define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
105105 entry:
106106 %stack = alloca [8 x i32], align 16
107107 %0 = load i32, i32 addrspace(1)* %in, align 4
126126 ; OPT: alloca [5 x i32]
127127
128128 ; SI-NOT: ds_write
129 define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
129 define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
130130 entry:
131131 %stack = alloca [5 x i32], align 4
132132 %0 = load i32, i32 addrspace(1)* %in, align 4
161161 ; SI-NOT: v_movrel
162162 %struct.point = type { i32, i32 }
163163
164 define void @multiple_structs(i32 addrspace(1)* %out) #0 {
164 define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
165165 entry:
166166 %a = alloca %struct.point
167167 %b = alloca %struct.point
190190 ; R600-NOT: MOVA_INT
191191 ; SI-NOT: v_movrel
192192
193 define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
193 define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
194194 entry:
195195 %prv_array_const = alloca [2 x i32]
196196 %prv_array = alloca [2 x i32]
234234 ; SI-PROMOTE: s_load_dword [[IDX:s[0-9]+]]
235235 ; SI-PROMOTE: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16
236236 ; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16
237 define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
237 define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
238238 entry:
239239 %0 = alloca [2 x i16]
240240 %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
257257
258258 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
259259 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
260 define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
260 define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
261261 entry:
262262 %0 = alloca [2 x i8]
263263 %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
280280 ;
281281 ; A total of 5 bytes should be allocated and used.
282282 ; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ;
283 define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
283 define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
284284 entry:
285285 %0 = alloca [3 x i8], align 1
286286 %1 = alloca [2 x i8], align 1
304304 ret void
305305 }
306306
307 define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
307 define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
308308 entry:
309309 %alloca = alloca [2 x [2 x i8]]
310310 %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
318318 ret void
319319 }
320320
321 define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
321 define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
322322 entry:
323323 %alloca = alloca [2 x [2 x i32]]
324324 %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
331331 ret void
332332 }
333333
334 define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
334 define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
335335 entry:
336336 %alloca = alloca [2 x [2 x i64]]
337337 %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
346346
347347 %struct.pair32 = type { i32, i32 }
348348
349 define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
349 define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
350350 entry:
351351 %alloca = alloca [2 x [2 x %struct.pair32]]
352352 %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
359359 ret void
360360 }
361361
362 define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
362 define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
363363 entry:
364364 %alloca = alloca [2 x %struct.pair32]
365365 %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
372372 ret void
373373 }
374374
375 define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
375 define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
376376 entry:
377377 %tmp = alloca [2 x i32]
378378 %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
393393 ; SI-NOT: ds_write
394394 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
395395 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
396 define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
396 define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
397397 %alloca = alloca [16 x i32]
398398 %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
399399 store i32 5, i32* %tmp0
409409 ; OPT-LABEL: @pointer_typed_alloca(
410410 ; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
411411 ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
412 define void @pointer_typed_alloca(i32 addrspace(1)* %A) {
412 define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
413413 entry:
414414 %A.addr = alloca i32 addrspace(1)*, align 4
415415 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
461461 ; SI: buffer_load_dword
462462 ; SI: buffer_load_dword
463463
464 define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
464 define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
465465 %alloca = alloca [2 x <16 x i32>]
466466 %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
467467 %tmp5 = load <16 x i32>, <16 x i32>* %tmp0
505505 ; SI: buffer_load_dword
506506 ; SI: buffer_load_dword
507507
508 define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
508 define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
509509 %alloca = alloca [2 x <16 x float>]
510510 %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
511511 %tmp5 = load <16 x float>, <16 x float>* %tmp0
521521 ; SI: buffer_load_dword
522522 ; SI: buffer_load_dword
523523
524 define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
524 define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
525525 %alloca = alloca [16 x <2 x float>]
526526 %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
527527 %tmp5 = load <2 x float>, <2 x float>* %tmp0
532532 ; OPT-LABEL: @direct_alloca_read_0xi32(
533533 ; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)*
534534 ; OPT: load [0 x i32], [0 x i32] addrspace(3)*
535 define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
535 define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
536536 entry:
537537 %tmp = alloca [0 x i32]
538538 store [0 x i32] [], [0 x i32]* %tmp
544544 ; OPT-LABEL: @direct_alloca_read_1xi32(
545545 ; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)*
546546 ; OPT: load [1 x i32], [1 x i32] addrspace(3)*
547 define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
547 define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
548548 entry:
549549 %tmp = alloca [1 x i32]
550550 store [1 x i32] [i32 0], [1 x i32]* %tmp
1111
1212 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
1313 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
14 define void @ngroups_x (i32 addrspace(1)* %out) {
14 define amdgpu_kernel void @ngroups_x (i32 addrspace(1)* %out) {
1515 entry:
1616 %0 = call i32 @llvm.r600.read.ngroups.x() #0
1717 store i32 %0, i32 addrspace(1)* %out
2626
2727 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
2828 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
29 define void @ngroups_y (i32 addrspace(1)* %out) {
29 define amdgpu_kernel void @ngroups_y (i32 addrspace(1)* %out) {
3030 entry:
3131 %0 = call i32 @llvm.r600.read.ngroups.y() #0
3232 store i32 %0, i32 addrspace(1)* %out
4141
4242 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
4343 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
44 define void @ngroups_z (i32 addrspace(1)* %out) {
44 define amdgpu_kernel void @ngroups_z (i32 addrspace(1)* %out) {
4545 entry:
4646 %0 = call i32 @llvm.r600.read.ngroups.z() #0
4747 store i32 %0, i32 addrspace(1)* %out
5656
5757 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
5858 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
59 define void @global_size_x (i32 addrspace(1)* %out) {
59 define amdgpu_kernel void @global_size_x (i32 addrspace(1)* %out) {
6060 entry:
6161 %0 = call i32 @llvm.r600.read.global.size.x() #0
6262 store i32 %0, i32 addrspace(1)* %out
7171
7272 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
7373 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
74 define void @global_size_y (i32 addrspace(1)* %out) {
74 define amdgpu_kernel void @global_size_y (i32 addrspace(1)* %out) {
7575 entry:
7676 %0 = call i32 @llvm.r600.read.global.size.y() #0
7777 store i32 %0, i32 addrspace(1)* %out
8686
8787 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
8888 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
89 define void @global_size_z (i32 addrspace(1)* %out) {
89 define amdgpu_kernel void @global_size_z (i32 addrspace(1)* %out) {
9090 entry:
9191 %0 = call i32 @llvm.r600.read.global.size.z() #0
9292 store i32 %0, i32 addrspace(1)* %out
101101
102102 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
103103 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z
104 define void @local_size_x (i32 addrspace(1)* %out) {
104 define amdgpu_kernel void @local_size_x (i32 addrspace(1)* %out) {
105105 entry:
106106 %0 = call i32 @llvm.r600.read.local.size.x() #0
107107 store i32 %0, i32 addrspace(1)* %out
116116
117117 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
118118 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].W
119 define void @local_size_y (i32 addrspace(1)* %out) {
119 define amdgpu_kernel void @local_size_y (i32 addrspace(1)* %out) {
120120 entry:
121121 %0 = call i32 @llvm.r600.read.local.size.y() #0
122122 store i32 %0, i32 addrspace(1)* %out
131131
132132 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
133133 ; EG: MOV {{\*? *}}[[VAL]], KC0[2].X
134 define void @local_size_z (i32 addrspace(1)* %out) {
134 define amdgpu_kernel void @local_size_z (i32 addrspace(1)* %out) {
135135 entry:
136136 %0 = call i32 @llvm.r600.read.local.size.z() #0
137137 store i32 %0, i32 addrspace(1)* %out
152152 ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
153153 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
154154 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
155 define void @tgid_x_legacy(i32 addrspace(1)* %out) {
155 define amdgpu_kernel void @tgid_x_legacy(i32 addrspace(1)* %out) {
156156 entry:
157157 %0 = call i32 @llvm.r600.read.tgid.x() #0
158158 store i32 %0, i32 addrspace(1)* %out
164164 ; GCN-NOHSA: buffer_store_dword [[VVAL]]
165165
166166 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
167 define void @tgid_y_legacy(i32 addrspace(1)* %out) {
167 define amdgpu_kernel void @tgid_y_legacy(i32 addrspace(1)* %out) {
168168 entry:
169169 %0 = call i32 @llvm.r600.read.tgid.y() #0
170170 store i32 %0, i32 addrspace(1)* %out
180180 ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
181181 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
182182 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
183 define void @tgid_z_legacy(i32 addrspace(1)* %out) {
183 define amdgpu_kernel void @tgid_z_legacy(i32 addrspace(1)* %out) {
184184 entry:
185185 %0 = call i32 @llvm.r600.read.tgid.z() #0
186186 store i32 %0, i32 addrspace(1)* %out
193193
194194 ; FUNC-LABEL: {{^}}tidig_x_legacy:
195195 ; GCN-NOHSA: buffer_store_dword v0
196 define void @tidig_x_legacy(i32 addrspace(1)* %out) {
196 define amdgpu_kernel void @tidig_x_legacy(i32 addrspace(1)* %out) {
197197 entry:
198198 %0 = call i32 @llvm.r600.read.tidig.x() #0
199199 store i32 %0, i32 addrspace(1)* %out
207207 ; FUNC-LABEL: {{^}}tidig_y_legacy:
208208
209209 ; GCN-NOHSA: buffer_store_dword v1
210 define void @tidig_y_legacy(i32 addrspace(1)* %out) {
210 define amdgpu_kernel void @tidig_y_legacy(i32 addrspace(1)* %out) {
211211 entry:
212212 %0 = call i32 @llvm.r600.read.tidig.y() #0
213213 store i32 %0, i32 addrspace(1)* %out
220220
221221 ; FUNC-LABEL: {{^}}tidig_z_legacy:
222222 ; GCN-NOHSA: buffer_store_dword v2
223 define void @tidig_z_legacy(i32 addrspace(1)* %out) {
223 define amdgpu_kernel void @tidig_z_legacy(i32 addrspace(1)* %out) {
224224 entry:
225225 %0 = call i32 @llvm.r600.read.tidig.z() #0
226226 store i32 %0, i32 addrspace(1)* %out
33 ; FUNC-LABEL: {{^}}v_and_i64_br:
44 ; SI: v_and_b32
55 ; SI: v_and_b32
6 define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
6 define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
77 entry:
88 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
99 %tmp0 = icmp eq i32 %tid, 0
1010 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
1111 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
1212
13 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
13 define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
1414 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
1515 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
1616 %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
3030 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
3131 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
3232
33 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
33 define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
3434 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
3535 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
3636 %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
4141
4242 ; FUNC-LABEL: {{^}}s_and_i32:
4343 ; SI: s_and_b32
44 define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
44 define amdgpu_kernel void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
4545 %and = and i32 %a, %b
4646 store i32 %and, i32 addrspace(1)* %out, align 4
4747 ret void
4949
5050 ; FUNC-LABEL: {{^}}s_and_constant_i32:
5151 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
52 define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
52 define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
5353 %and = and i32 %a, 1234567
5454 store i32 %and, i32 addrspace(1)* %out, align 4
5555 ret void
6565 ; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
6666 ; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
6767 ; SI: buffer_store_dword [[VK]]
68 define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
68 define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
6969 %and = and i32 %a, 1234567
7070
7171 ; Just to stop future replacement of copy to vgpr + store with VALU op.
8282 ; SI: s_add_i32
8383 ; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]]
8484 ; SI: buffer_store_dword [[VK]]
85 define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
85 define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
8686 %and = and i32 %a, 1234567
8787 %foo = add i32 %and, 1234567
8888 %bar = add i32 %foo, %b
9292
9393 ; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr:
9494 ; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
95 define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
95 define amdgpu_kernel void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
9696 %tid = call i32 @llvm.r600.read.tidig.x() #0
9797 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
9898 %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
108108 ; SI-DAG: s_load_dword [[SA:s[0-9]+]]
109109 ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
110110 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
111 define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
111 define amdgpu_kernel void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
112112 %tid = call i32 @llvm.r600.read.tidig.x() #0
113113 %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
114114 %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
122122 ; SI-DAG: s_load_dword [[SA:s[0-9]+]]
123123 ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
124124 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
125 define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
125 define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
126126 %tid = call i32 @llvm.r600.read.tidig.x() #0
127127 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
128128 %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
134134
135135 ; FUNC-LABEL: {{^}}v_and_constant_i32
136136 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
137 define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
137 define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
138138 %a = load i32, i32 addrspace(1)* %aptr, align 4
139139 %and = and i32 %a, 1234567
140140 store i32 %and, i32 addrspace(1)* %out, align 4
143143
144144 ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
145145 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
146 define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
146 define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
147147 %a = load i32, i32 addrspace(1)* %aptr, align 4
148148 %and = and i32 %a, 64
149149 store i32 %and, i32 addrspace(1)* %out, align 4
152152
153153 ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
154154 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
155 define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
155 define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
156156 %a = load i32, i32 addrspace(1)* %aptr, align 4
157157 %and = and i32 %a, -16
158158 store i32 %and, i32 addrspace(1)* %out, align 4
161161
162162 ; FUNC-LABEL: {{^}}s_and_i64
163163 ; SI: s_and_b64
164 define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
164 define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
165165 %and = and i64 %a, %b
166166 store i64 %and, i64 addrspace(1)* %out, align 8
167167 ret void
170170 ; FIXME: Should use SGPRs
171171 ; FUNC-LABEL: {{^}}s_and_i1:
172172 ; SI: v_and_b32
173 define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
173 define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
174174 %and = and i1 %a, %b
175175 store i1 %and, i1 addrspace(1)* %out
176176 ret void
180180 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}}
181181 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}}
182182 ; SI: buffer_store_dwordx2
183 define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
183 define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
184184 %and = and i64 %a, 549756338176
185185 store i64 %and, i64 addrspace(1)* %out, align 8
186186 ret void
190190 ; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}}
191191 ; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}}
192192 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}
193 define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
193 define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
194194 %and0 = and i64 %a, 549756338176
195195 %and1 = and i64 %b, 549756338176
196196 store volatile i64 %and0, i64 addrspace(1)* %out
204204 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
205205 ; SI-NOT: and
206206 ; SI: buffer_store_dwordx2
207 define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
207 define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
208208 %and = and i64 %a, 1234567
209209 store i64 %and, i64 addrspace(1)* %out, align 8
210210 ret void
222222 ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
223223 ; SI-NOT: and
224224 ; SI: buffer_store_dwordx2
225 define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
225 define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
226226 %shl.a = shl i64 %a, 1
227227 %shl.b = shl i64 %b, 1
228228 %and0 = and i64 %shl.a, 62
237237 ; FUNC-LABEL: {{^}}v_and_i64:
238238 ; SI: v_and_b32
239239 ; SI: v_and_b32
240 define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
240 define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
241241 %a = load i64, i64 addrspace(1)* %aptr, align 8
242242 %b = load i64, i64 addrspace(1)* %bptr, align 8
243243 %and = and i64 %a, %b
249249 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}}
250250 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
251251 ; SI: buffer_store_dwordx2
252 define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
252 define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
253253 %a = load i64, i64 addrspace(1)* %aptr, align 8
254254 %and = and i64 %a, 1231231234567
255255 store i64 %and, i64 addrspace(1)* %out, align 8
267267 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
268268 ; SI: buffer_store_dwordx2
269269 ; SI: buffer_store_dwordx2
270 define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
270 define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
271271 %a = load volatile i64, i64 addrspace(1)* %aptr
272272 %b = load volatile i64, i64 addrspace(1)* %aptr
273273 %and0 = and i64 %a, 1231231234567
287287 ; SI-NOT: and
288288 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
289289 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
290 define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
290 define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
291291 %a = load volatile i64, i64 addrspace(1)* %aptr
292292 %b = load volatile i64, i64 addrspace(1)* %aptr
293293 %and0 = and i64 %a, 63
303303 ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
304304 ; SI-NOT: and
305305 ; SI: buffer_store_dwordx2
306 define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
306 define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
307307 %a = load i64, i64 addrspace(1)* %aptr, align 8
308308 %and = and i64 %a, 1234567
309309 store i64 %and, i64 addrspace(1)* %out, align 8
316316 ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
317317 ; SI-NOT: and
318318 ; SI: buffer_store_dwordx2
319 define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
319 define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
320320 %a = load i64, i64 addrspace(1)* %aptr, align 8
321321 %and = and i64 %a, 64
322322 store i64 %and, i64 addrspace(1)* %out, align 8
330330 ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
331331 ; SI-NOT: and
332332 ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
333 define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
333 define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
334334 %a = load i64, i64 addrspace(1)* %aptr, align 8
335335 %and = and i64 %a, -8
336336 store i64 %and, i64 addrspace(1)* %out, align 8
343343 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64
344344 ; SI-NOT: and
345345 ; SI: buffer_store_dword
346 define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
346 define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
347347 %and = and i64 %a, 64
348348 store i64 %and, i64 addrspace(1)* %out, align 8
349349 ret void
357357 ; SI-NOT: and
358358 ; SI: s_add_u32
359359 ; SI-NEXT: s_addc_u32
360 define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
360 define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
361361 %shl = shl i64 %a, 1
362362 %and = and i64 %shl, 64
363363 %add = add i64 %and, %b
371371 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
372372 ; SI-NOT: and
373373 ; SI: buffer_store_dwordx2
374 define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
374 define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
375375 %and = and i64 %a, 1
376376 store i64 %and, i64 addrspace(1)* %out, align 8
377377 ret void
386386 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
387387 ; SI-NOT: and
388388 ; SI: buffer_store_dwordx2
389 define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
389 define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
390390 %and = and i64 %a, 4607182418800017408
391391 store i64 %and, i64 addrspace(1)* %out, align 8
392392 ret void
401401 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
402402 ; SI-NOT: and
403403 ; SI: buffer_store_dwordx2
404 define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
404 define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
405405 %and = and i64 %a, 13830554455654793216
406406 store i64 %and, i64 addrspace(1)* %out, align 8
407407 ret void
416416 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
417417 ; SI-NOT: and
418418 ; SI: buffer_store_dwordx2
419 define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
419 define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
420420 %and = and i64 %a, 4602678819172646912
421421 store i64 %and, i64 addrspace(1)* %out, align 8
422422 ret void
431431 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
432432 ; SI-NOT: and
433433 ; SI: buffer_store_dwordx2
434 define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
434 define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
435435 %and = and i64 %a, 13826050856027422720
436436 store i64 %and, i64 addrspace(1)* %out, align 8
437437 ret void
444444 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
445445 ; SI-NOT: and
446446 ; SI: buffer_store_dwordx2
447 define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
447 define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
448448 %and = and i64 %a, 4611686018427387904
449449 store i64 %and, i64 addrspace(1)* %out, align 8
450450 ret void
457457 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
458458 ; SI-NOT: and
459459 ; SI: buffer_store_dwordx2
460 define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
460 define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
461461 %and = and i64 %a, 13835058055282163712
462462 store i64 %and, i64 addrspace(1)* %out, align 8
463463 ret void
472472 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
473473 ; SI-NOT: and
474474 ; SI: buffer_store_dwordx2
475 define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
475 define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
476476 %and = and i64 %a, 4616189618054758400
477477 store i64 %and, i64 addrspace(1)* %out, align 8
478478 ret void
487487 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
488488 ; SI-NOT: and
489489 ; SI: buffer_store_dwordx2
490 define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
490 define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
491491 %and = and i64 %a, 13839561654909534208
492492 store i64 %and, i64 addrspace(1)* %out, align 8
493493 ret void
504504 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
505505 ; SI-NOT: and
506506 ; SI: buffer_store_dwordx2
507 define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
507 define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
508508 %and = and i64 %a, 1082130432
509509 store i64 %and, i64 addrspace(1)* %out, align 8
510510 ret void
517517 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
518518 ; SI-NOT: and
519519 ; SI: buffer_store_dwordx2
520 define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
520 define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
521521 %and = and i64 %a, -1065353216
522522 store i64 %and, i64 addrspace(1)* %out, align 8
523523 ret void
530530 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
531531 ; SI-NOT: and
532532 ; SI: buffer_store_dwordx2
533 define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
533 define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
534534 %and = and i64 %a, 4647714815446351872
535535 store i64 %and, i64 addrspace(1)* %out, align 8
536536 ret void
543543 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
544544 ; SI-NOT: and
545545 ; SI: buffer_store_dwordx2
546 define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
546 define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
547547 %and = and i64 %a, 13871086852301127680
548548 store i64 %and, i64 addrspace(1)* %out, align 8
549549 ret void
1010 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
1111 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
1212
13 ; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
14 define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
13 ; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
14 define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
1515 %val = call i32 @llvm.amdgcn.workgroup.id.x()
1616 store i32 %val, i32 addrspace(1)* %ptr
1717 ret void
1818 }
1919
20 ; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
21 define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
20 ; HSA: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
21 define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
2222 %val = call i32 @llvm.amdgcn.workgroup.id.y()
2323 store i32 %val, i32 addrspace(1)* %ptr
2424 ret void
2525 }
2626
27 ; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
28 define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
27 ; HSA: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
28 define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
2929 %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
3030 store volatile i32 %val0, i32 addrspace(1)* %ptr
3131 %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
3333 ret void
3434 }
3535
36 ; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
37 define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
36 ; HSA: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
37 define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
3838 %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
3939 %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
4040 store volatile i32 %val0, i32 addrspace(1)* %ptr
4242 ret void
4343 }
4444
45 ; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
46 define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
45 ; HSA: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
46 define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
4747 %val = call i32 @llvm.amdgcn.workgroup.id.z()
4848 store i32 %val, i32 addrspace(1)* %ptr
4949 ret void
5050 }
5151
52 ; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
53 define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
52 ; HSA: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
53 define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
5454 %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
5555 %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
5656 store volatile i32 %val0, i32 addrspace(1)* %ptr
5858 ret void
5959 }
6060
61 ; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
62 define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
61 ; HSA: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
62 define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
6363 %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
6464 %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
6565 store volatile i32 %val0, i32 addrspace(1)* %ptr
6767 ret void
6868 }
6969
70 ; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
71 define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
70 ; HSA: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
71 define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
7272 %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
7373 %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
7474 %val2 = call i32 @llvm.amdgcn.workgroup.id.z()
7878 ret void
7979 }
8080
81 ; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
82 define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
81 ; HSA: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
82 define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
8383 %val = call i32 @llvm.amdgcn.workitem.id.x()
8484 store i32 %val, i32 addrspace(1)* %ptr
8585 ret void
8686 }
8787
88 ; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
89 define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
88 ; HSA: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
89 define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
9090 %val = call i32 @llvm.amdgcn.workitem.id.y()
9191 store i32 %val, i32 addrspace(1)* %ptr
9292 ret void
9393 }
9494
95 ; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
96 define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
95 ; HSA: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
96 define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
9797 %val = call i32 @llvm.amdgcn.workitem.id.z()
9898 store i32 %val, i32 addrspace(1)* %ptr
9999 ret void
100100 }
101101
102 ; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
103 define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
102 ; HSA: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
103 define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
104104 %val0 = call i32 @llvm.amdgcn.workitem.id.x()
105105 %val1 = call i32 @llvm.amdgcn.workgroup.id.x()
106106 store volatile i32 %val0, i32 addrspace(1)* %ptr
108108 ret void
109109 }
110110
111 ; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
112 define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
111 ; HSA: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
112 define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
113113 %val0 = call i32 @llvm.amdgcn.workitem.id.y()
114114 %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
115115 store volatile i32 %val0, i32 addrspace(1)* %ptr
117117 ret void
118118 }
119119
120 ; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
121 define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
120 ; HSA: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
121 define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
122122 %val0 = call i32 @llvm.amdgcn.workitem.id.x()
123123 %val1 = call i32 @llvm.amdgcn.workitem.id.y()
124124 %val2 = call i32 @llvm.amdgcn.workitem.id.z()
128128 ret void
129129 }
130130
131 ; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
132 define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
131 ; HSA: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
132 define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
133133 %val0 = call i32 @llvm.amdgcn.workitem.id.x()
134134 %val1 = call i32 @llvm.amdgcn.workitem.id.y()
135135 %val2 = call i32 @llvm.amdgcn.workitem.id.z()
145145 ret void
146146 }
147147
148 ; HSA: define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
149 define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
148 ; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
149 define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
150150 %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
151151 %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
152152 %val = load i32, i32 addrspace(2)* %bc
154154 ret void
155155 }
156156
157 ; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
158 define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
157 ; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
158 define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
159159 %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
160160 %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
161161 %val = load i32, i32 addrspace(2)* %bc
163163 ret void
164164 }
165165
166 ; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
167 define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
166 ; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
167 define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
168168 %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
169169 store volatile i32 0, i32 addrspace(4)* %stof
170170 ret void
171171 }
172172
173 ; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
174 define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
173 ; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
174 define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
175175 %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
176176 store volatile i32 0, i32 addrspace(4)* %stof
177177 ret void
178178 }
179179
180 ; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
181 define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
180 ; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
181 define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
182182 %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
183183 store volatile i32 0, i32 addrspace(3)* %ftos
184184 ret void
185185 }
186186
187 ; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
188 define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
187 ; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
188 define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
189189 %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
190190 store volatile i32 0, i32* %ftos
191191 ret void
192192 }
193193
194194 ; No-op addrspacecast should not use queue ptr
195 ; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
196 define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
195 ; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
196 define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
197197 %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
198198 store volatile i32 0, i32 addrspace(4)* %stof
199199 ret void
200200 }
201201
202 ; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
203 define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
202 ; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
203 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
204204 %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
205205 %ld = load volatile i32, i32 addrspace(4)* %stof
206206 ret void
207207 }
208208
209 ; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
210 define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
209 ; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
210 define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
211211 %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
212212 store volatile i32 0, i32 addrspace(1)* %ftos
213213 ret void
214214 }
215215
216 ; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
217 define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
216 ; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
217 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
218218 %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
219219 %ld = load volatile i32, i32 addrspace(2)* %ftos
220220 ret void
1111 declare i32 @llvm.r600.read.local.size.y() #0
1212 declare i32 @llvm.r600.read.local.size.z() #0
1313
14 ; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
15 define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
14 ; ALL: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
15 define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
1616 %val = call i32 @llvm.r600.read.tgid.x()
1717 store i32 %val, i32 addrspace(1)* %ptr
1818 ret void
1919 }
2020
21 ; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
22 define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
21 ; ALL: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
22 define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
2323 %val = call i32 @llvm.r600.read.tgid.y()
2424 store i32 %val, i32 addrspace(1)* %ptr
2525 ret void
2626 }
2727
28 ; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
29 define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
28 ; ALL: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
29 define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
3030 %val0 = call i32 @llvm.r600.read.tgid.y()
3131 store volatile i32 %val0, i32 addrspace(1)* %ptr
3232 %val1 = call i32 @llvm.r600.read.tgid.y()
3434 ret void
3535 }
3636
37 ; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
38 define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
37 ; ALL: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
38 define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
3939 %val0 = call i32 @llvm.r600.read.tgid.x()
4040 %val1 = call i32 @llvm.r600.read.tgid.y()
4141 store volatile i32 %val0, i32 addrspace(1)* %ptr
4343 ret void
4444 }
4545
46 ; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
47 define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
46 ; ALL: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
47 define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
4848 %val = call i32 @llvm.r600.read.tgid.z()
4949 store i32 %val, i32 addrspace(1)* %ptr
5050 ret void
5151 }
5252
53 ; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
54 define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
53 ; ALL: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
54 define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
5555 %val0 = call i32 @llvm.r600.read.tgid.x()
5656 %val1 = call i32 @llvm.r600.read.tgid.z()
5757 store volatile i32 %val0, i32 addrspace(1)* %ptr
5959 ret void
6060 }
6161
62 ; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
63 define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
62 ; ALL: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
63 define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
6464 %val0 = call i32 @llvm.r600.read.tgid.y()
6565 %val1 = call i32 @llvm.r600.read.tgid.z()
6666 store volatile i32 %val0, i32 addrspace(1)* %ptr
6868 ret void
6969 }
7070
71 ; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
72 define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
71 ; ALL: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
72 define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
7373 %val0 = call i32 @llvm.r600.read.tgid.x()
7474 %val1 = call i32 @llvm.r600.read.tgid.y()
7575 %val2 = call i32 @llvm.r600.read.tgid.z()
7979 ret void
8080 }
8181
82 ; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
83 define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
82 ; ALL: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
83 define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
8484 %val = call i32 @llvm.r600.read.tidig.x()
8585 store i32 %val, i32 addrspace(1)* %ptr
8686 ret void
8787 }
8888
89 ; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
90 define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
89 ; ALL: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
90 define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
9191 %val = call i32 @llvm.r600.read.tidig.y()
9292 store i32 %val, i32 addrspace(1)* %ptr
9393 ret void
9494 }
9595
96 ; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
97 define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
96 ; ALL: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
97 define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
9898 %val = call i32 @llvm.r600.read.tidig.z()
9999 store i32 %val, i32 addrspace(1)* %ptr
100100 ret void
101101 }
102102
103 ; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
104 define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
103 ; ALL: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
104 define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
105105 %val0 = call i32 @llvm.r600.read.tidig.x()
106106 %val1 = call i32 @llvm.r600.read.tgid.x()
107107 store volatile i32 %val0, i32 addrspace(1)* %ptr
109109 ret void
110110 }
111111
112 ; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
113 define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
112 ; ALL: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
113 define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
114114 %val0 = call i32 @llvm.r600.read.tidig.y()
115115 %val1 = call i32 @llvm.r600.read.tgid.y()
116116 store volatile i32 %val0, i32 addrspace(1)* %ptr
118118 ret void
119119 }
120120
121 ; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
122 define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
121 ; ALL: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
122 define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
123123 %val0 = call i32 @llvm.r600.read.tidig.x()
124124 %val1 = call i32 @llvm.r600.read.tidig.y()
125125 %val2 = call i32 @llvm.r600.read.tidig.z()
129129 ret void
130130 }
131131
132 ; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
133 define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
132 ; ALL: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
133 define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
134134 %val0 = call i32 @llvm.r600.read.tidig.x()
135135 %val1 = call i32 @llvm.r600.read.tidig.y()
136136 %val2 = call i32 @llvm.r600.read.tidig.z()
146146 ret void
147147 }
148148
149 ; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
150 ; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
151 define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
149 ; HSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
150 ; NOHSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
151 define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
152152 %val = call i32 @llvm.r600.read.local.size.x()
153153 store i32 %val, i32 addrspace(1)* %ptr
154154 ret void
155155 }
156156
157 ; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
158 ; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
159 define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
157 ; HSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
158 ; NOHSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
159 define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
160160 %val = call i32 @llvm.r600.read.local.size.y()
161161 store i32 %val, i32 addrspace(1)* %ptr
162162 ret void
163163 }
164164
165 ; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
166 ; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
167 define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
165 ; HSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
166 ; NOHSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
167 define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
168168 %val = call i32 @llvm.r600.read.local.size.z()
169169 store i32 %val, i32 addrspace(1)* %ptr
170170 ret void
55 ; CHECK-LABEL: {{^}}test:
66 ; CHECK: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, __unnamed_1
77 ; CHECK: s_endpgm
8 define void @test() {
8 define amdgpu_kernel void @test() {
99 store i32 1, i32 addrspace(1)* @0
1010 ret void
1111 }
1212
1313 ; CHECK-LABEL: {{^}}__unnamed_2:
1414 ; CHECK: s_endpgm
15 define void @1() {
15 define amdgpu_kernel void @1() {
1616 ret void
1717 }
55
66 ; GCN-LABEL: {{^}}anyext_i1_i32:
77 ; GCN: v_cndmask_b32_e64
8 define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
8 define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
99 entry:
1010 %tmp = icmp eq i32 %cond, 0
1111 %tmp1 = zext i1 %tmp to i8
2121 ; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]]
2222 ; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]]
2323 ; VI: buffer_store_dword [[AND]]
24 define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
24 define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
2525 entry:
2626 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
2727 %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
2323
2424 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
2525 ; SI-PROMOTE: ds_write_b32 [[PTRREG]]
26 define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
26 define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
2727 %alloca = alloca [16 x i32], align 16
2828 %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
2929 %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
66 ; SI-DAG: v_mul_lo_i32
77 ; SI-DAG: v_mul_hi_i32
88 ; SI: s_endpgm
9 define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
9 define amdgpu_kernel void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
1010 %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1111 %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
1212 %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0