llvm.org GIT mirror: llvm, commit ec0a7cd
R600/SI: Remove i1 pseudo VALU ops

Select i1 logical ops directly to 64-bit SALU instructions. Vector i1 values are always really in SGPRs, with one bit per item in the wave. This saves about 4 instructions when and/or/xor-ing any condition, and also helps write conditions that need to be passed in vcc. This should work correctly now that the SGPR live range fixing pass works. More work is needed to eliminate the VReg_1 pseudo regclass and possibly the entire SILowerI1Copies pass.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@223206 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault
10 changed files with 339 additions and 124 deletions.
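The mechanics behind the commit message: on SI, an i1 condition for a whole wave is really a 64-bit lane mask held in an SGPR pair (or vcc), one bit per work-item, so and/or/xor of two conditions can be a single 64-bit scalar op instead of the multi-instruction VGPR sequence the removed pseudos expanded to. A minimal host-side sketch of that model (plain C++, not LLVM code; the wave size of 64 and the helper names are illustrative assumptions):

// lane_mask_sketch.cpp: model i1 wave conditions as 64-bit lane masks.
#include <cstdint>
#include <cstdio>

constexpr int WaveSize = 64;     // assumption: 64 work-items per wave
using LaneMask = uint64_t;       // models an SGPR pair (or vcc) holding one i1 per lane

// Each lane evaluates its own condition; the wave-wide i1 is the packed mask.
// This corresponds to a VALU compare writing its result to an SGPR pair.
static LaneMask compare_gt(const int *a, const int *b) {
  LaneMask m = 0;
  for (int lane = 0; lane < WaveSize; ++lane)
    if (a[lane] > b[lane])
      m |= LaneMask(1) << lane;
  return m;
}

int main() {
  int a[WaveSize], b[WaveSize], c[WaveSize];
  for (int i = 0; i < WaveSize; ++i) { a[i] = i; b[i] = 32; c[i] = 63 - i; }

  LaneMask x = compare_gt(a, b);
  LaneMask y = compare_gt(c, b);
  // The point of the patch: combining two wave conditions is one 64-bit op
  // (s_and_b64 / s_or_b64 / s_xor_b64), not a per-lane select/and/compare chain.
  LaneMask both = x & y;
  std::printf("lanes where both conditions hold: %016llx\n",
              static_cast<unsigned long long>(both));
  return 0;
}

The combined mask is also directly usable as the selector of a per-lane select (v_cndmask) or as vcc, which is the second benefit the message mentions.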
128128
129129 def as_i32imm: SDNodeXForm<imm, [{
130130 return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
131 }]>;
132
133 def as_i64imm: SDNodeXForm<imm, [{
134 return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i64);
131135 }]>;
132136
133137 def IMM8bit : PatLeaf <(imm),
16851685 //===----------------------------------------------------------------------===//
16861686 // Pseudo Instructions
16871687 //===----------------------------------------------------------------------===//
1688
16891688 let isCodeGenOnly = 1, isPseudo = 1 in {
1690
1691 def V_MOV_I1 : InstSI <
1692 (outs VReg_1:$dst),
1693 (ins i1imm:$src),
1694 "", [(set i1:$dst, (imm:$src))]
1695 >;
1696
1697 def V_AND_I1 : InstSI <
1698 (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
1699 [(set i1:$dst, (and i1:$src0, i1:$src1))]
1700 >;
1701
1702 def V_OR_I1 : InstSI <
1703 (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
1704 [(set i1:$dst, (or i1:$src0, i1:$src1))]
1705 >;
1706
1707 def V_XOR_I1 : InstSI <
1708 (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
1709 [(set i1:$dst, (xor i1:$src0, i1:$src1))]
1710 >;
17111689
17121690 let hasSideEffects = 1 in {
17131691 def SGPR_USE : InstSI <(outs),(ins), "", []>;
24942472 (S_MOV_B64 InlineImm:$imm)
24952473 >;
24962474
2475 // XXX - Should this use an s_cmp to set SCC?
2476
2477 // Set to sign-extended 64-bit value (true = -1, false = 0)
2478 def : Pat <
2479 (i1 imm:$imm),
2480 (S_MOV_B64 (i64 (as_i64imm $imm)))
2481 >;
2482
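The comment and pattern above encode the convention that an i1 immediate is materialized as a sign-extended 64-bit value: true becomes -1 (every lane bit set), false becomes 0, so the constant is immediately usable as a lane mask. A one-liner illustrating that convention (plain C++, illustrative only):

#include <cstdint>

// true -> 0xFFFFFFFFFFFFFFFF (all lanes set), false -> 0; mirrors as_i64imm applied to an i1.
inline uint64_t materialize_i1(bool b) { return b ? ~uint64_t(0) : uint64_t(0); }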
24972483 /********** ===================== **********/
24982484 /********** Interpolation Patterns **********/
24992485 /********** ===================== **********/
30443030 (V_CNDMASK_B32_e64 0, -1, $src), sub1)
30453031 >;
30463032
3033 // If we need to perform a logical operation on i1 values, we need to
3034 // use vector comparisons since there is only one SCC register. Vector
3035 // comparisons still write to a pair of SGPRs, so treat these as
3036 // 64-bit comparisons. When legalizing SGPR copies, instructions
3037 // that end up copying from SCC into these operations will be
3038 // moved to the VALU.
3039 def : Pat <
3040 (i1 (and i1:$src0, i1:$src1)),
3041 (S_AND_B64 $src0, $src1)
3042 >;
3043
3044 def : Pat <
3045 (i1 (or i1:$src0, i1:$src1)),
3046 (S_OR_B64 $src0, $src1)
3047 >;
3048
3049 def : Pat <
3050 (i1 (xor i1:$src0, i1:$src1)),
3051 (S_XOR_B64 $src0, $src1)
3052 >;
3053
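With these patterns, a logical op on two comparison results lowers to: two VALU compares, each writing a 64-bit mask (one to vcc, one to an SGPR pair), one s_and_b64/s_or_b64/s_xor_b64 to combine them, and, when an integer 0/-1 result is needed, a single v_cndmask_b32 selecting 0 or -1 per lane. A host-side sketch of that sequence for an 'fcmp one' (ordered and not-equal), matching the shape the updated FileCheck lines below look for; plain C++, wave size 64 and function names are illustrative:

#include <cmath>
#include <cstdint>

using LaneMask = uint64_t;                     // SGPR pair or vcc

// fcmp one a, b  ==  (a, b ordered) && (a != b): two compares plus s_and_b64.
static LaneMask fcmp_one(const float *a, const float *b) {
  LaneMask ord = 0, neq = 0;
  for (int lane = 0; lane < 64; ++lane) {
    if (!std::isnan(a[lane]) && !std::isnan(b[lane]))   // ~ v_cmp_o_f32
      ord |= LaneMask(1) << lane;
    if (a[lane] != b[lane])                             // ~ v_cmp_neq_f32 (NaN details glossed over)
      neq |= LaneMask(1) << lane;
  }
  return ord & neq;                                     // ~ s_and_b64
}

// v_cndmask_b32_e64 dst, 0, -1, mask: per-lane select of 0 or -1 by the mask bit.
static void cndmask_0_neg1(int32_t out[64], LaneMask m) {
  for (int lane = 0; lane < 64; ++lane)
    out[lane] = ((m >> lane) & 1) ? -1 : 0;
}

Compared with the old lowering, the two v_cndmask_b32 plus v_and_b32 steps in the middle disappear; only the final 0/-1 materialization, when needed at all, stays per lane.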
30473054 def : Pat <
30483055 (f32 (sint_to_fp i1:$src)),
30493056 (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
30563063
30573064 def : Pat <
30583065 (f64 (sint_to_fp i1:$src)),
3059 (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
3066 (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
30603067 >;
30613068
30623069 def : Pat <
8484 Next = std::next(I);
8585 MachineInstr &MI = *I;
8686
87 if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
88 I1Defs.push_back(MI.getOperand(0).getReg());
89 MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
90 continue;
91 }
92
93 if (MI.getOpcode() == AMDGPU::V_AND_I1) {
94 I1Defs.push_back(MI.getOperand(0).getReg());
95 MI.setDesc(TII->get(AMDGPU::V_AND_B32_e64));
96 continue;
97 }
98
99 if (MI.getOpcode() == AMDGPU::V_OR_I1) {
100 I1Defs.push_back(MI.getOperand(0).getReg());
101 MI.setDesc(TII->get(AMDGPU::V_OR_B32_e64));
102 continue;
103 }
104
105 if (MI.getOpcode() == AMDGPU::V_XOR_I1) {
106 I1Defs.push_back(MI.getOperand(0).getReg());
107 MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e64));
108 continue;
109 }
110
11187 if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
11288 unsigned Reg = MI.getOperand(0).getReg();
11389 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
11692 continue;
11793 }
11894
119 if (MI.getOpcode() != AMDGPU::COPY ||
120 !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
121 !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
95 if (MI.getOpcode() != AMDGPU::COPY)
12296 continue;
12397
98 const MachineOperand &Dst = MI.getOperand(0);
99 const MachineOperand &Src = MI.getOperand(1);
124100
125 const TargetRegisterClass *DstRC =
126 MRI.getRegClass(MI.getOperand(0).getReg());
127 const TargetRegisterClass *SrcRC =
128 MRI.getRegClass(MI.getOperand(1).getReg());
101 if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
102 !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
103 continue;
104
105 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
106 const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
129107
130108 if (DstRC == &AMDGPU::VReg_1RegClass &&
131109 TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
132 I1Defs.push_back(MI.getOperand(0).getReg());
133 BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
134 .addOperand(MI.getOperand(0))
135 .addImm(0)
136 .addImm(-1)
137 .addOperand(MI.getOperand(1));
110 I1Defs.push_back(Dst.getReg());
111 DebugLoc DL = MI.getDebugLoc();
112
113 MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
114 if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
115 if (DefInst->getOperand(1).isImm()) {
116 I1Defs.push_back(Dst.getReg());
117
118 int64_t Val = DefInst->getOperand(1).getImm();
119 assert(Val == 0 || Val == -1);
120
121 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
122 .addOperand(Dst)
123 .addImm(Val);
124 MI.eraseFromParent();
125 continue;
126 }
127 }
128
129 BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
130 .addOperand(Dst)
131 .addImm(0)
132 .addImm(-1)
133 .addOperand(Src);
138134 MI.eraseFromParent();
139135 } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
140136 SrcRC == &AMDGPU::VReg_1RegClass) {
141137 BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
142 .addOperand(MI.getOperand(0))
143 .addOperand(MI.getOperand(1))
144 .addImm(0);
138 .addOperand(Dst)
139 .addOperand(Src)
140 .addImm(0);
145141 MI.eraseFromParent();
146142 }
147143 }
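Summarizing the hunk above: copies into the VReg_1 pseudo class are now folded to a plain v_mov_b32 when the SGPR source is defined by an s_mov_b64 of a known immediate (the new patterns only produce 0 or -1), and otherwise still become a v_cndmask_b32 that expands the mask to a per-lane 0/-1 value; the reverse direction stays a v_cmp_ne_i32. A compact sketch of that decision, with hypothetical helper names rather than the pass's real interface:

#include <cstdint>
#include <optional>
#include <string>

// Which instruction the pass picks for an i1-related COPY (distilled, illustrative).
std::string chooseI1CopyLowering(bool dstIsVReg1, std::optional<int64_t> knownSrcImm) {
  if (dstIsVReg1) {
    if (knownSrcImm)                                  // source def was s_mov_b64 of 0 or -1
      return "v_mov_b32_e32 dst, " + std::to_string(*knownSrcImm);
    return "v_cndmask_b32_e64 dst, 0, -1, src";       // general mask -> per-lane 0/-1
  }
  // VReg_1 value (0/-1 per lane) back to a real SGPR-pair mask:
  return "v_cmp_ne_i32_e64 dst, src, 0";
}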
0 ; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
1 ; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
1 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
22
33 declare double @llvm.ceil.f64(double) nounwind readnone
44 declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
2121 ; SI: cmp_gt_i32
2222 ; SI: cndmask_b32
2323 ; SI: cndmask_b32
24 ; SI: cmp_gt_f64
25 ; SI: cndmask_b32
26 ; SI: cmp_ne_i32
27 ; SI: cndmask_b32
28 ; SI: cndmask_b32
24 ; SI: v_cmp_o_f64
25 ; SI: v_cmp_neq_f64
26 ; SI: s_and_b64
27 ; SI: v_cmp_gt_f64
28 ; SI: s_and_b64
29 ; SI: v_cndmask_b32
30 ; SI: v_cndmask_b32
2931 ; SI: v_add_f64
32 ; SI: s_endpgm
3033 define void @fceil_f64(double addrspace(1)* %out, double %x) {
3134 %y = call double @llvm.ceil.f64(double %x) nounwind readnone
3235 store double %y, double addrspace(1)* %out
0 ; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
1 ; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
1 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
22
33 declare double @llvm.floor.f64(double) nounwind readnone
44 declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
2222 ; SI: cmp_gt_i32
2323 ; SI: cndmask_b32
2424 ; SI: cndmask_b32
25 ; SI: cmp_lt_f64
26 ; SI: cndmask_b32
27 ; SI: cmp_ne_i32
28 ; SI: cndmask_b32
29 ; SI: cndmask_b32
25 ; SI: v_cmp_o_f64
26 ; SI: v_cmp_neq_f64
27 ; SI: s_and_b64
28 ; SI: v_cmp_lt_f64
29 ; SI: s_and_b64
30 ; SI: v_cndmask_b32
31 ; SI: v_cndmask_b32
3032 ; SI: v_add_f64
33 ; SI: s_endpgm
3134 define void @ffloor_f64(double addrspace(1)* %out, double %x) {
3235 %y = call double @llvm.floor.f64(double %x) nounwind readnone
3336 store double %y, double addrspace(1)* %out
9595 ; R600-DAG: SETNE_DX10
9696 ; R600-DAG: AND_INT
9797 ; R600-DAG: SETNE_INT
98 ; SI: v_cmp_o_f32
99 ; SI: v_cmp_neq_f32
100 ; SI: v_cndmask_b32_e64
101 ; SI: v_cndmask_b32_e64
102 ; SI: v_and_b32_e32
98
99 ; SI-DAG: v_cmp_o_f32_e32 vcc
100 ; SI-DAG: v_cmp_neq_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
101 ; SI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP1]], vcc
102 ; SI: v_cndmask_b32_e64 [[VRESULT:v[0-9]+]], 0, -1, [[AND]]
103 ; SI: buffer_store_dword [[VRESULT]]
103104 define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
104105 entry:
105106 %0 = fcmp one float %a, %b
129130 ; R600-DAG: SETE_DX10
130131 ; R600-DAG: OR_INT
131132 ; R600-DAG: SETNE_INT
132 ; SI: v_cmp_u_f32
133 ; SI: v_cmp_eq_f32
134 ; SI: v_cndmask_b32_e64
135 ; SI: v_cndmask_b32_e64
136 ; SI: v_or_b32_e32
133
134 ; SI-DAG: v_cmp_u_f32_e32 vcc
135 ; SI-DAG: v_cmp_eq_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
136 ; SI: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], [[CMP1]], vcc
137 ; SI: v_cndmask_b32_e64 [[VRESULT:v[0-9]+]], 0, -1, [[OR]]
138 ; SI: buffer_store_dword [[VRESULT]]
137139 define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
138140 entry:
139141 %0 = fcmp ueq float %a, %b
147149 ; R600: SETE_DX10
148150 ; SI: v_cmp_u_f32
149151 ; SI: v_cmp_gt_f32
150 ; SI: v_cndmask_b32_e64
151 ; SI: v_cndmask_b32_e64
152 ; SI: v_or_b32_e32
152 ; SI: s_or_b64
153 ; SI: v_cndmask_b32
153154 define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
154155 entry:
155156 %0 = fcmp ugt float %a, %b
163164 ; R600: SETE_DX10
164165 ; SI: v_cmp_u_f32
165166 ; SI: v_cmp_ge_f32
166 ; SI: v_cndmask_b32_e64
167 ; SI: v_cndmask_b32_e64
168 ; SI: v_or_b32_e32
167 ; SI: s_or_b64
168 ; SI: v_cndmask_b32
169169 define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
170170 entry:
171171 %0 = fcmp uge float %a, %b
179179 ; R600: SETE_DX10
180180 ; SI: v_cmp_u_f32
181181 ; SI: v_cmp_lt_f32
182 ; SI: v_cndmask_b32_e64
183 ; SI: v_cndmask_b32_e64
184 ; SI: v_or_b32_e32
182 ; SI: s_or_b64
183 ; SI: v_cndmask_b32
185184 define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
186185 entry:
187186 %0 = fcmp ult float %a, %b
195194 ; R600: SETE_DX10
196195 ; SI: v_cmp_u_f32
197196 ; SI: v_cmp_le_f32
198 ; SI: v_cndmask_b32_e64
199 ; SI: v_cndmask_b32_e64
200 ; SI: v_or_b32_e32
197 ; SI: s_or_b64
198 ; SI: v_cndmask_b32
201199 define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
202200 entry:
203201 %0 = fcmp ule float %a, %b
5656 }
5757
5858 ; FUNC-LABEL: {{^}}f64_one:
59 ; SI: v_cmp_o_f64
60 ; SI: v_cmp_neq_f64
61 ; SI: v_cndmask_b32_e64
62 ; SI: v_cndmask_b32_e64
63 ; SI: v_and_b32_e32
59 ; SI-DAG: v_cmp_o_f64_e32 vcc
60 ; SI-DAG: v_cmp_neq_f64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
61 ; SI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP1]], vcc
62 ; SI: v_cndmask_b32_e64 [[VRESULT:v[0-9]+]], 0, -1, [[AND]]
63 ; SI: buffer_store_dword [[VRESULT]]
6464 define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
6565 entry:
6666 %0 = fcmp one double %a, %b
8282 ; FUNC-LABEL: {{^}}f64_ueq:
8383 ; SI: v_cmp_u_f64
8484 ; SI: v_cmp_eq_f64
85 ; SI: v_cndmask_b32_e64
86 ; SI: v_cndmask_b32_e64
87 ; SI: v_or_b32_e32
85 ; SI: s_or_b64
86 ; SI: v_cndmask_b32
8887 define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
8988 entry:
9089 %0 = fcmp ueq double %a, %b
9695 ; FUNC-LABEL: {{^}}f64_ugt:
9796 ; SI: v_cmp_u_f64
9897 ; SI: v_cmp_gt_f64
99 ; SI: v_cndmask_b32_e64
100 ; SI: v_cndmask_b32_e64
101 ; SI: v_or_b32_e32
98 ; SI: s_or_b64
99 ; SI: v_cndmask_b32
102100 define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
103101 entry:
104102 %0 = fcmp ugt double %a, %b
110108 ; FUNC-LABEL: {{^}}f64_uge:
111109 ; SI: v_cmp_u_f64
112110 ; SI: v_cmp_ge_f64
113 ; SI: v_cndmask_b32_e64
114 ; SI: v_cndmask_b32_e64
115 ; SI: v_or_b32_e32
111 ; SI: s_or_b64
112 ; SI: v_cndmask_b32
116113 define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
117114 entry:
118115 %0 = fcmp uge double %a, %b
124121 ; FUNC-LABEL: {{^}}f64_ult:
125122 ; SI: v_cmp_u_f64
126123 ; SI: v_cmp_lt_f64
127 ; SI: v_cndmask_b32_e64
128 ; SI: v_cndmask_b32_e64
129 ; SI: v_or_b32_e32
124 ; SI: s_or_b64
125 ; SI: v_cndmask_b32
130126 define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
131127 entry:
132128 %0 = fcmp ult double %a, %b
138134 ; FUNC-LABEL: {{^}}f64_ule:
139135 ; SI: v_cmp_u_f64
140136 ; SI: v_cmp_le_f64
141 ; SI: v_cndmask_b32_e64
142 ; SI: v_cndmask_b32_e64
143 ; SI: v_or_b32_e32
137 ; SI: s_or_b64
138 ; SI: v_cndmask_b32
144139 define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
145140 entry:
146141 %0 = fcmp ule double %a, %b
5858 ret void
5959 }
6060
61 ; FIXME: Should write to different SGPR pairs instead of copying to
62 ; VALU for i1 phi.
63
64 ; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
65 ; SI: buffer_load_dword [[AVAL:v[0-9]+]]
66 ; SI: v_cmp_lt_i32_e64 [[CMP_IF:s\[[0-9]+:[0-9]+\]]], [[AVAL]], 0
67 ; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
68
69 ; SI: BB2_1:
70 ; SI: buffer_load_dword [[AVAL:v[0-9]+]]
71 ; SI: v_cmp_eq_i32_e64 [[CMP_ELSE:s\[[0-9]+:[0-9]+\]]], [[AVAL]], 0
72 ; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
73
74 ; SI: v_cmp_ne_i32_e64 [[CMP_CMP:s\[[0-9]+:[0-9]+\]]], [[V_CMP]], 0
75 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
76 ; SI: buffer_store_dword [[RESULT]]
77 define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
78 entry:
79 %tid = call i32 @llvm.r600.read.tidig.x() #0
80 %tmp1 = icmp eq i32 %tid, 0
81 br i1 %tmp1, label %if, label %else
82
83 if:
84 %gep.if = getelementptr i32 addrspace(1)* %a, i32 %tid
85 %a.val = load i32 addrspace(1)* %gep.if
86 %cmp.if = icmp eq i32 %a.val, 0
87 br label %endif
88
89 else:
90 %gep.else = getelementptr i32 addrspace(1)* %b, i32 %tid
91 %b.val = load i32 addrspace(1)* %gep.else
92 %cmp.else = icmp slt i32 %b.val, 0
93 br label %endif
94
95 endif:
96 %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else]
97 %ext = sext i1 %tmp4 to i32
98 store i32 %ext, i32 addrspace(1)* %out
99 ret void
100 }
101
61102 declare i32 @llvm.r600.read.tidig.x() #0
62103
63104 attributes #0 = { readnone }
0 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
11
2 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
3
4 ; SI-LABEL: @test_if
25 ; Make sure the i1 values created by the cfg structurizer pass are
36 ; moved using VALU instructions
47 ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
58 ; SI: v_mov_b32_e32 v{{[0-9]}}, -1
6 define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) {
9 define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
710 entry:
811 switch i32 %a, label %default [
912 i32 0, label %case0
3639 end:
3740 ret void
3841 }
42
43 ; SI-LABEL: @simple_test_v_if
44 ; SI: v_cmp_ne_i32_e64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
45 ; SI: s_and_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
46 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
47
48 ; SI: ; BB#1
49 ; SI: buffer_store_dword
50 ; SI: s_endpgm
51
52 ; SI: BB1_2:
53 ; SI: s_or_b64 exec, exec, [[BR_SREG]]
54 ; SI: s_endpgm
55 define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
56 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
57 %is.0 = icmp ne i32 %tid, 0
58 br i1 %is.0, label %store, label %exit
59
60 store:
61 %gep = getelementptr i32 addrspace(1)* %dst, i32 %tid
62 store i32 999, i32 addrspace(1)* %gep
63 ret void
64
65 exit:
66 ret void
67 }
68
69 ; SI-LABEL: @simple_test_v_loop
70 ; SI: v_cmp_ne_i32_e64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0
71 ; SI: s_and_saveexec_b64 [[BR_SREG]], [[BR_SREG]]
72 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
73 ; SI: s_cbranch_execz BB2_2
74
75 ; SI: ; BB#1:
76 ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
77
78 ; SI: BB2_3:
79 ; SI: buffer_load_dword
80 ; SI: buffer_store_dword
81 ; SI: v_cmp_eq_i32_e32 vcc,
82 ; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]]
83 ; SI: v_add_i32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
84 ; SI: s_andn2_b64 exec, exec, [[OR_SREG]]
85 ; SI: s_cbranch_execnz BB2_3
86
87 define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
88 entry:
89 %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
90 %is.0 = icmp ne i32 %tid, 0
91 %limit = add i32 %tid, 64
92 br i1 %is.0, label %loop, label %exit
93
94 loop:
95 %i = phi i32 [%tid, %entry], [%i.inc, %loop]
96 %gep.src = getelementptr i32 addrspace(1)* %src, i32 %i
97 %gep.dst = getelementptr i32 addrspace(1)* %dst, i32 %i
98 %load = load i32 addrspace(1)* %src
99 store i32 %load, i32 addrspace(1)* %gep.dst
100 %i.inc = add nsw i32 %i, 1
101 %cmp = icmp eq i32 %limit, %i.inc
102 br i1 %cmp, label %exit, label %loop
103
104 exit:
105 ret void
106 }
107
108 ; SI-LABEL: @multi_vcond_loop
109
110 ; Load loop limit from buffer
111 ; Branch to exit if uniformly not taken
112 ; SI: ; BB#0:
113 ; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
114 ; SI: v_cmp_gt_i32_e64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]]
115 ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG]], [[OUTER_CMP_SREG]]
116 ; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
117 ; SI: s_cbranch_execz BB3_2
118
119 ; Initialize inner condition to false
120 ; SI: ; BB#1:
121 ; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
122 ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
123
124 ; Clear exec bits for workitems that load -1s
125 ; SI: BB3_3:
126 ; SI: buffer_load_dword [[A:v[0-9]+]]
127 ; SI: buffer_load_dword [[B:v[0-9]+]]
128 ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], [[A]], -1
129 ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_1:s\[[0-9]+:[0-9]+\]]], [[B]], -1
130 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
131 ; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]]
132 ; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]]
133 ; SI: s_cbranch_execz BB3_5
134
135 ; SI: BB#4:
136 ; SI: buffer_store_dword
137 ; SI: v_cmp_ge_i64_e32 vcc
138 ; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]
139
140 ; SI: BB3_5:
141 ; SI: s_or_b64 exec, exec, [[ORNEG1]]
142 ; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]]
143 ; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
144 ; SI: s_cbranch_execnz BB3_3
145
146 ; SI: BB#6
147 ; SI: s_or_b64 exec, exec, [[COND_STATE]]
148
149 ; SI: BB3_2:
150 ; SI-NOT: [[COND_STATE]]
151 ; SI: s_endpgm
152
153 define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
154 bb:
155 %tmp = tail call i32 @llvm.r600.read.tidig.x() #0
156 %tmp4 = sext i32 %tmp to i64
157 %tmp5 = getelementptr inbounds i32 addrspace(1)* %arg3, i64 %tmp4
158 %tmp6 = load i32 addrspace(1)* %tmp5, align 4
159 %tmp7 = icmp sgt i32 %tmp6, 0
160 %tmp8 = sext i32 %tmp6 to i64
161 br i1 %tmp7, label %bb10, label %bb26
162
163 bb10: ; preds = %bb, %bb20
164 %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
165 %tmp12 = add nsw i64 %tmp11, %tmp4
166 %tmp13 = getelementptr inbounds i32 addrspace(1)* %arg1, i64 %tmp12
167 %tmp14 = load i32 addrspace(1)* %tmp13, align 4
168 %tmp15 = getelementptr inbounds i32 addrspace(1)* %arg2, i64 %tmp12
169 %tmp16 = load i32 addrspace(1)* %tmp15, align 4
170 %tmp17 = icmp ne i32 %tmp14, -1
171 %tmp18 = icmp ne i32 %tmp16, -1
172 %tmp19 = and i1 %tmp17, %tmp18
173 br i1 %tmp19, label %bb20, label %bb26
174
175 bb20: ; preds = %bb10
176 %tmp21 = add nsw i32 %tmp16, %tmp14
177 %tmp22 = getelementptr inbounds i32 addrspace(1)* %arg, i64 %tmp12
178 store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
179 %tmp23 = add nuw nsw i64 %tmp11, 1
180 %tmp24 = icmp slt i64 %tmp23, %tmp8
181 br i1 %tmp24, label %bb10, label %bb26
182
183 bb26: ; preds = %bb10, %bb20, %bb
184 ret void
185 }
186
187 attributes #0 = { nounwind readnone }
188 attributes #1 = { nounwind }
3838 ; FUNC-LABEL: {{^}}xor_i1:
3939 ; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
4040
41 ; SI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
42
41 ; SI-DAG: v_cmp_ge_f32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 0.0
42 ; SI-DAG: v_cmp_ge_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, 1.0
43 ; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]]
44 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]]
45 ; SI: buffer_store_dword [[RESULT]]
46 ; SI: s_endpgm
4347 define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
4448 %a = load float addrspace(1) * %in0
4549 %b = load float addrspace(1) * %in1
4650 %acmp = fcmp oge float %a, 0.000000e+00
47 %bcmp = fcmp oge float %b, 0.000000e+00
51 %bcmp = fcmp oge float %b, 1.000000e+00
4852 %xor = xor i1 %acmp, %bcmp
4953 %result = select i1 %xor, float %a, float %b
5054 store float %result, float addrspace(1)* %out
55 ret void
56 }
57
58 ; FUNC-LABEL: {{^}}v_xor_i1:
59 ; SI: buffer_load_ubyte [[A:v[0-9]+]]
60 ; SI: buffer_load_ubyte [[B:v[0-9]+]]
61 ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]]
62 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
63 ; SI: buffer_store_byte [[RESULT]]
64 define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
65 %a = load i1 addrspace(1)* %in0
66 %b = load i1 addrspace(1)* %in1
67 %xor = xor i1 %a, %b
68 store i1 %xor, i1 addrspace(1)* %out
5169 ret void
5270 }
5371