llvm.org GIT mirror llvm / c18c17d
[AMDGPU] gfx1010 core wave32 changes Differential Revision: https://reviews.llvm.org/D63204 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363934 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin 28 days ago
32 changed file(s) with 1933 addition(s) and 60 deletion(s). Raw diff Collapse all Expand all
776776 FeatureLDSBankCount32,
777777 FeatureDLInsts,
778778 FeatureNSAEncoding,
779 FeatureWavefrontSize64,
779 FeatureWavefrontSize32,
780780 FeatureScalarStores,
781781 FeatureScalarAtomics,
782782 FeatureScalarFlatScratchInsts,
794794 FeatureDot5Insts,
795795 FeatureDot6Insts,
796796 FeatureNSAEncoding,
797 FeatureWavefrontSize64,
797 FeatureWavefrontSize32,
798798 FeatureScalarStores,
799799 FeatureScalarAtomics,
800800 FeatureScalarFlatScratchInsts,
811811 FeatureDot5Insts,
812812 FeatureDot6Insts,
813813 FeatureNSAEncoding,
814 FeatureWavefrontSize64,
814 FeatureWavefrontSize32,
815815 FeatureScalarStores,
816816 FeatureScalarAtomics,
817817 FeatureScalarFlatScratchInsts,
4949 def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
5050
5151 def AMDGPUIfOp : SDTypeProfile<1, 2,
52 [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
52 [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
5353 >;
5454
5555 def AMDGPUElseOp : SDTypeProfile<1, 2,
56 [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>]
56 [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
5757 >;
5858
5959 def AMDGPULoopOp : SDTypeProfile<0, 2,
60 [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
60 [SDTCisVT<0, i1>, SDTCisVT<1, OtherVT>]
6161 >;
6262
6363 def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
64 [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
64 [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>]
6565 >;
6666
6767 //===----------------------------------------------------------------------===//
100100 return addOperand(Inst, MCOperand::createImm(Imm));
101101 }
102102
103 static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val,
104 uint64_t Addr, const void *Decoder) {
105 auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
106 return addOperand(Inst, DAsm->decodeBoolReg(Val));
107 }
108
103109 #define DECODE_OPERAND(StaticDecoderName, DecoderName) \
104110 static DecodeStatus StaticDecoderName(MCInst &Inst, \
105111 unsigned Imm, \
945945 /// not exist. If Opcode is not a pseudo instruction, this is identity.
946946 int pseudoToMCOpcode(int Opcode) const;
947947
948 const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum,
949 const TargetRegisterInfo *TRI,
950 const MachineFunction &MF)
951 const override {
952 if (OpNum >= TID.getNumOperands())
953 return nullptr;
954 return RI.getRegClass(TID.OpInfo[OpNum].RegClass);
955 }
956
948957 void fixImplicitOperands(MachineInstr &MI) const;
949958 };
950959
765765 let PrintMethod = "printVOPDst";
766766 }
767767
768 // SCSrc_i1 is the operand for pseudo instructions only.
769 // Boolean immediates shall not be exposed to codegen instructions.
770 def SCSrc_i1 : RegisterOperand {
771 let OperandNamespace = "AMDGPU";
772 let OperandType = "OPERAND_REG_IMM_INT32";
773 let ParserMatchClass = BoolReg;
774 let DecoderMethod = "decodeBoolReg";
775 }
776
768777 // ===----------------------------------------------------------------------===//
769778 // ExpSrc* Special cases for exp src operands which are printed as
770779 // "off" depending on en operand.
803812 def SDWASrc_f32 : SDWASrc;
804813 def SDWASrc_f16 : SDWASrc;
805814
806 def SDWAVopcDst : VOPDstOperand {
815 def SDWAVopcDst : BoolRC {
807816 let OperandNamespace = "AMDGPU";
808817 let OperandType = "OPERAND_SDWA_VOPC_DST";
809818 let EncoderMethod = "getSDWAVopcDstEncoding";
810819 let DecoderMethod = "decodeSDWAVopcDst";
820 let PrintMethod = "printVOPDst";
811821 }
812822
813823 class NamedMatchClass : AsmOperandClass {
938948 // constant bus.
939949 def KImmFP16MatchClass : KImmMatchClass<16>;
940950 def f16kimm : kimmOperand;
941
942
943 def VOPDstS64 : VOPDstOperand {
944 let PrintMethod = "printVOPDst";
945 }
946951
947952 class FPInputModsMatchClass : AsmOperandClass {
948953 let Name = "RegOrImmWithFP"#opSize#"InputMods";
12361241 !if(!eq(VT.Size, 128), VOPDstOperand,
12371242 !if(!eq(VT.Size, 64), VOPDstOperand,
12381243 !if(!eq(VT.Size, 16), VOPDstOperand,
1239 VOPDstOperand)))); // else VT == i1
1244 VOPDstS64orS32)))); // else VT == i1
12401245 }
12411246
12421247 // Returns the register class to use for the destination of VOP[12C]
13121317 VSrc_f64,
13131318 VSrc_b64),
13141319 !if(!eq(VT.Value, i1.Value),
1315 SCSrc_i1,
1320 SSrc_i1,
13161321 !if(isFP,
13171322 !if(!eq(VT.Value, f16.Value),
13181323 VSrc_f16,
120120
121121 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
122122
123 def ENTER_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins i64imm:$src0)> {
123 def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
124124 let Defs = [EXEC];
125125 let hasSideEffects = 0;
126126 let mayLoad = 0;
127127 let mayStore = 0;
128128 }
129129
130 def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
130 def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
131131 let hasSideEffects = 0;
132132 let mayLoad = 0;
133133 let mayStore = 0;
160160 >;
161161
162162 def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
163 (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
163 (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
164164 >;
165165
166166 def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
167 (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
167 (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
168168 >;
169169 } // End usesCustomInserter = 1, Defs = [SCC]
170170
232232 let OtherPredicates = [EnableLateCFGStructurize] in {
233233 def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
234234 (outs),
235 (ins SReg_64:$vcc, brtarget:$target),
235 (ins SReg_1:$vcc, brtarget:$target),
236236 [(brcond i1:$vcc, bb:$target)]> {
237237 let Size = 12;
238238 }
239239 }
240240
241241 def SI_IF: CFPseudoInstSI <
242 (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
243 [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
242 (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
243 [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
244244 let Constraints = "";
245245 let Size = 12;
246246 let hasSideEffects = 1;
247247 }
248248
249249 def SI_ELSE : CFPseudoInstSI <
250 (outs SReg_64:$dst),
251 (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
250 (outs SReg_1:$dst),
251 (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
252252 let Size = 12;
253253 let hasSideEffects = 1;
254254 }
255255
256256 def SI_LOOP : CFPseudoInstSI <
257 (outs), (ins SReg_64:$saved, brtarget:$target),
258 [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
257 (outs), (ins SReg_1:$saved, brtarget:$target),
258 [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
259259 let Size = 8;
260260 let isBranch = 1;
261261 let hasSideEffects = 1;
264264 } // End isTerminator = 1
265265
266266 def SI_END_CF : CFPseudoInstSI <
267 (outs), (ins SReg_64:$saved),
268 [(int_amdgcn_end_cf i64:$saved)], 1, 1> {
267 (outs), (ins SReg_1:$saved), [], 1, 1> {
269268 let Size = 4;
270269 let isAsCheapAsAMove = 1;
271270 let isReMaterializable = 1;
275274 }
276275
277276 def SI_IF_BREAK : CFPseudoInstSI <
278 (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
279 [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
277 (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
280278 let Size = 4;
281279 let isAsCheapAsAMove = 1;
282280 let isReMaterializable = 1;
302300 }
303301 }
304302
305 defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
303 defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
306304 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
307305
308306 let Defs = [EXEC,VCC] in
321319 }
322320
323321 def SI_PS_LIVE : PseudoInstSI <
324 (outs SReg_64:$dst), (ins),
322 (outs SReg_1:$dst), (ins),
325323 [(set i1:$dst, (int_amdgcn_ps_live))]> {
326324 let SALU = 1;
327325 }
583581 >;
584582
585583 def : GCNPat<
586 (AMDGPUelse i64:$src, bb:$target),
584 (AMDGPUelse i1:$src, bb:$target),
587585 (SI_ELSE $src, $target, 0)
588586 >;
589587
732732
733733 defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;
734734
735 def SCSrc_i1 : RegisterOperand;
736
737735 //===----------------------------------------------------------------------===//
738736 // VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
739737 //===----------------------------------------------------------------------===//
343343 let AsmDPP8 = "$vdst, vcc, $src0, $src1 $dpp8$fi";
344344 let AsmDPP16 = AsmDPP#"$fi";
345345 let Outs32 = (outs DstRC:$vdst);
346 let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
346 let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
347347 }
348348
349349 // Write out to vcc or arbitrary SGPR and read in from vcc or
357357 let AsmDPP8 = "$vdst, vcc, $src0, $src1, vcc $dpp8$fi";
358358 let AsmDPP16 = AsmDPP#"$fi";
359359 let Outs32 = (outs DstRC:$vdst);
360 let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
360 let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
361361
362362 // Suppress src2 implied by type since the 32-bit encoding uses an
363363 // implicit VCC use.
182182 let HasModifiers = 0;
183183 let HasClamp = 0;
184184 let HasOMod = 0;
185 let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
185 let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
186186 let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
187187 }
188188
202202 // FIXME: Hack to stop printing _e64
203203 let DstRC = RegisterOperand;
204204
205 let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
205 let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
206206 let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp";
207207 }
208208
5555 let Asm32 = "$src0, $src1";
5656 // The destination for 32-bit encoding is implicit.
5757 let HasDst32 = 0;
58 let Outs64 = (outs VOPDstS64:$sdst);
58 let Outs64 = (outs VOPDstS64orS32:$sdst);
5959 list Schedule = sched;
6060 }
6161
2121 ; GFX10-LABEL: add3:
2222 ; GFX10: ; %bb.0:
2323 ; GFX10-NEXT: v_add3_u32 v0, v0, v1, v2
24 ; GFX10-NEXT: ; implicit-def: $vcc_hi
2425 ; GFX10-NEXT: ; return to shader part epilog
2526 %x = add i32 %a, %b
2627 %result = add i32 %x, %c
4546 ; GFX10-LABEL: mad_no_add3:
4647 ; GFX10: ; %bb.0:
4748 ; GFX10-NEXT: v_mad_u32_u24 v0, v0, v1, v4
49 ; GFX10-NEXT: ; implicit-def: $vcc_hi
4850 ; GFX10-NEXT: v_mad_u32_u24 v0, v2, v3, v0
4951 ; GFX10-NEXT: ; return to shader part epilog
5052 %a0 = shl i32 %a, 8
8486 ; GFX10-LABEL: add3_vgpr_b:
8587 ; GFX10: ; %bb.0:
8688 ; GFX10-NEXT: v_add3_u32 v0, s3, s2, v0
89 ; GFX10-NEXT: ; implicit-def: $vcc_hi
8790 ; GFX10-NEXT: ; return to shader part epilog
8891 %x = add i32 %a, %b
8992 %result = add i32 %x, %c
106109 ; GFX10-LABEL: add3_vgpr_all2:
107110 ; GFX10: ; %bb.0:
108111 ; GFX10-NEXT: v_add3_u32 v0, v1, v2, v0
112 ; GFX10-NEXT: ; implicit-def: $vcc_hi
109113 ; GFX10-NEXT: ; return to shader part epilog
110114 %x = add i32 %b, %c
111115 %result = add i32 %a, %x
128132 ; GFX10-LABEL: add3_vgpr_bc:
129133 ; GFX10: ; %bb.0:
130134 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, v1
135 ; GFX10-NEXT: ; implicit-def: $vcc_hi
131136 ; GFX10-NEXT: ; return to shader part epilog
132137 %x = add i32 %a, %b
133138 %result = add i32 %x, %c
150155 ; GFX10-LABEL: add3_vgpr_const:
151156 ; GFX10: ; %bb.0:
152157 ; GFX10-NEXT: v_add3_u32 v0, v0, v1, 16
158 ; GFX10-NEXT: ; implicit-def: $vcc_hi
153159 ; GFX10-NEXT: ; return to shader part epilog
154160 %x = add i32 %a, %b
155161 %result = add i32 %x, 16
174180 ; GFX10-LABEL: add3_multiuse_outer:
175181 ; GFX10: ; %bb.0:
176182 ; GFX10-NEXT: v_add3_u32 v0, v0, v1, v2
183 ; GFX10-NEXT: ; implicit-def: $vcc_hi
177184 ; GFX10-NEXT: v_mul_lo_u32 v1, v0, v3
178185 ; GFX10-NEXT: ; return to shader part epilog
179186 %inner = add i32 %a, %b
201208 ; GFX10-LABEL: add3_multiuse_inner:
202209 ; GFX10: ; %bb.0:
203210 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
211 ; GFX10-NEXT: ; implicit-def: $vcc_hi
204212 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v0, v2
205213 ; GFX10-NEXT: ; return to shader part epilog
206214 %inner = add i32 %a, %b
239247 ; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0
240248 ; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0
241249 ; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4
250 ; GFX10-NEXT: ; implicit-def: $vcc_hi
242251 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
243252 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v1, v0
244253 ; GFX10-NEXT: ; return to shader part epilog
None ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
12
23
34 ; GCN-LABEL: {{^}}add_var_var_i1:
4 ; GCN: s_xor_b64
5 ; GFX9: s_xor_b64
6 ; GFX10: s_xor_b32
57 define amdgpu_kernel void @add_var_var_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
68 %a = load volatile i1, i1 addrspace(1)* %in0
79 %b = load volatile i1, i1 addrspace(1)* %in1
1113 }
1214
1315 ; GCN-LABEL: {{^}}add_var_imm_i1:
14 ; GCN: s_not_b64
16 ; GFX9: s_not_b64
17 ; GFX10: s_not_b32
1518 define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {
1619 %a = load volatile i1, i1 addrspace(1)* %in
1720 %add = add i1 %a, 1
2124
2225 ; GCN-LABEL: {{^}}add_i1_cf:
2326 ; GCN: ; %endif
24 ; GCN: s_not_b64
27 ; GFX9: s_not_b64
28 ; GFX10: s_not_b32
2529 define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
2630 entry:
2731 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2121 ; GFX10-LABEL: add_shl:
2222 ; GFX10: ; %bb.0:
2323 ; GFX10-NEXT: v_add_lshl_u32 v0, v0, v1, v2
24 ; GFX10-NEXT: ; implicit-def: $vcc_hi
2425 ; GFX10-NEXT: ; return to shader part epilog
2526 %x = add i32 %a, %b
2627 %result = shl i32 %x, %c
4445 ; GFX10-LABEL: add_shl_vgpr_c:
4546 ; GFX10: ; %bb.0:
4647 ; GFX10-NEXT: v_add_lshl_u32 v0, s2, s3, v0
48 ; GFX10-NEXT: ; implicit-def: $vcc_hi
4749 ; GFX10-NEXT: ; return to shader part epilog
4850 %x = add i32 %a, %b
4951 %result = shl i32 %x, %c
6668 ; GFX10-LABEL: add_shl_vgpr_ac:
6769 ; GFX10: ; %bb.0:
6870 ; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, v1
71 ; GFX10-NEXT: ; implicit-def: $vcc_hi
6972 ; GFX10-NEXT: ; return to shader part epilog
7073 %x = add i32 %a, %b
7174 %result = shl i32 %x, %c
8891 ; GFX10-LABEL: add_shl_vgpr_const:
8992 ; GFX10: ; %bb.0:
9093 ; GFX10-NEXT: v_add_lshl_u32 v0, v0, v1, 9
94 ; GFX10-NEXT: ; implicit-def: $vcc_hi
9195 ; GFX10-NEXT: ; return to shader part epilog
9296 %x = add i32 %a, %b
9397 %result = shl i32 %x, 9
111115 ; GFX10-LABEL: add_shl_vgpr_const_inline_const:
112116 ; GFX10: ; %bb.0:
113117 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 9, 0x7e800
118 ; GFX10-NEXT: ; implicit-def: $vcc_hi
114119 ; GFX10-NEXT: ; return to shader part epilog
115120 %x = add i32 %a, 1012
116121 %result = shl i32 %x, 9
137142 ; GFX10-LABEL: add_shl_vgpr_inline_const_x2:
138143 ; GFX10: ; %bb.0:
139144 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 9, 0x600
145 ; GFX10-NEXT: ; implicit-def: $vcc_hi
140146 ; GFX10-NEXT: ; return to shader part epilog
141147 %x = add i32 %a, 3
142148 %result = shl i32 %x, 9
2121 ; GFX10-LABEL: and_or:
2222 ; GFX10: ; %bb.0:
2323 ; GFX10-NEXT: v_and_or_b32 v0, v0, v1, v2
24 ; GFX10-NEXT: ; implicit-def: $vcc_hi
2425 ; GFX10-NEXT: ; return to shader part epilog
2526 %x = and i32 %a, %b
2627 %result = or i32 %x, %c
4546 ; GFX10-LABEL: and_or_vgpr_b:
4647 ; GFX10: ; %bb.0:
4748 ; GFX10-NEXT: v_and_or_b32 v0, s2, v0, s3
49 ; GFX10-NEXT: ; implicit-def: $vcc_hi
4850 ; GFX10-NEXT: ; return to shader part epilog
4951 %x = and i32 %a, %b
5052 %result = or i32 %x, %c
6769 ; GFX10-LABEL: and_or_vgpr_ab:
6870 ; GFX10: ; %bb.0:
6971 ; GFX10-NEXT: v_and_or_b32 v0, v0, v1, s2
72 ; GFX10-NEXT: ; implicit-def: $vcc_hi
7073 ; GFX10-NEXT: ; return to shader part epilog
7174 %x = and i32 %a, %b
7275 %result = or i32 %x, %c
8992 ; GFX10-LABEL: and_or_vgpr_const:
9093 ; GFX10: ; %bb.0:
9194 ; GFX10-NEXT: v_and_or_b32 v0, v0, 4, v1
95 ; GFX10-NEXT: ; implicit-def: $vcc_hi
9296 ; GFX10-NEXT: ; return to shader part epilog
9397 %x = and i32 4, %a
9498 %result = or i32 %x, %b
112116 ; GFX10-LABEL: and_or_vgpr_const_inline_const:
113117 ; GFX10: ; %bb.0:
114118 ; GFX10-NEXT: v_and_or_b32 v0, v0, 20, 0x808
119 ; GFX10-NEXT: ; implicit-def: $vcc_hi
115120 ; GFX10-NEXT: ; return to shader part epilog
116121 %x = and i32 20, %a
117122 %result = or i32 %x, 2056
134139 ; GFX10-LABEL: and_or_vgpr_inline_const_x2:
135140 ; GFX10: ; %bb.0:
136141 ; GFX10-NEXT: v_and_or_b32 v0, v0, 4, 1
142 ; GFX10-NEXT: ; implicit-def: $vcc_hi
137143 ; GFX10-NEXT: ; return to shader part epilog
138144 %x = and i32 4, %a
139145 %result = or i32 %x, 1
0 ; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=WAVE64 --check-prefix=NOTES %s
11 ; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=WAVE64 --check-prefix=NOTES %s
22 ; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=WAVE64 --check-prefix=NOTES %s
3 ; run: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s
3 ; RUN: llc -mattr=+code-object-v3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX1010 --check-prefix=WAVE32 --check-prefix=NOTES %s
44
55 @var = addrspace(1) global float 0.0
66
None ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
0 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,WAVE64 %s
1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,WAVE32 %s
2
3 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo14:
4 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
5 ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ffc, [[FI]]
6 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
7 define amdgpu_kernel void @scratch_buffer_known_high_masklo14() #0 {
8 %alloca = alloca i32, align 4, addrspace(5)
9 store volatile i32 0, i32 addrspace(5)* %alloca
10 %toint = ptrtoint i32 addrspace(5)* %alloca to i32
11 %masked = and i32 %toint, 16383
12 store volatile i32 %masked, i32 addrspace(1)* undef
13 ret void
14 }
115
216 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo16:
317 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
418 ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xfffc, [[FI]]
5 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
19 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
620 define amdgpu_kernel void @scratch_buffer_known_high_masklo16() #0 {
721 %alloca = alloca i32, align 4, addrspace(5)
822 store volatile i32 0, i32 addrspace(5)* %alloca
1428
1529 ; GCN-LABEL: {{^}}scratch_buffer_known_high_masklo17:
1630 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
17 ; GCN-NOT: [[FI]]
18 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
31 ; WAVE64-NOT: [[FI]]
32 ; WAVE64: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
33
34 ; WAVE32: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1fffc, [[FI]]
35 ; WAVE32: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
1936 define amdgpu_kernel void @scratch_buffer_known_high_masklo17() #0 {
2037 %alloca = alloca i32, align 4, addrspace(5)
2138 store volatile i32 0, i32 addrspace(5)* %alloca
2845 ; GCN-LABEL: {{^}}scratch_buffer_known_high_mask18:
2946 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
3047 ; GCN-NOT: [[FI]]
31 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
48 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
3249 define amdgpu_kernel void @scratch_buffer_known_high_mask18() #0 {
3350 %alloca = alloca i32, align 4, addrspace(5)
3451 store volatile i32 0, i32 addrspace(5)* %alloca
0 # RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
1 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=W32 %s
12
23 ---
34 # GCN-LABEL: name: and_execz_mov_vccz
317318 S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
318319 S_ENDPGM 0, implicit $scc
319320 ...
321 ---
322 # W32-LABEL: name: and_execz_mov_vccz_w32
323 # W32-NOT: S_MOV_
324 # W32-NOT: S_AND_
325 # W32: S_CBRANCH_EXECZ %bb.1, implicit $exec
326 name: and_execz_mov_vccz_w32
327 body: |
328 bb.0:
329 S_NOP 0
330
331 bb.1:
332 S_NOP 0
333
334 bb.2:
335 $sgpr0 = S_MOV_B32 -1
336 $vcc_lo = S_AND_B32 $exec_lo, killed $sgpr0, implicit-def dead $scc
337 S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
338 S_ENDPGM 0
339 ...
None ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
1 ; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
0 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=SI,SICI,ALL %s
1 ; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=CI,SICI,ALL %s
2 ; RUN: opt -S -mcpu=gfx1010 -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefixes=GFX10,ALL %s
23
34 ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
45 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
4546 ret void
4647 }
4748
48 ; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
49 ; SICI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
50 ; GFX10: alloca [5 x i32]
4951
5052 define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
5153 entry:
140142 }
141143
142144 ; ALL-LABEL: @occupancy_6_over(
143 ; ALL: alloca [43 x i8]
145 ; SICI: alloca [43 x i8]
146 ; GFX10-NOT: alloca
147
144148 define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
145149 entry:
146150 %stack = alloca [43 x i8], align 4
190194 }
191195
192196 ; ALL-LABEL: @occupancy_8_over(
193 ; ALL: alloca [33 x i8]
197 ; SICI: alloca [33 x i8]
198 ; GFX10-NOT: alloca
199
194200 define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
195201 entry:
196202 %stack = alloca [33 x i8], align 4
240246 }
241247
242248 ; ALL-LABEL: @occupancy_9_over(
243 ; ALL: alloca [29 x i8]
249 ; SICI: alloca [29 x i8]
250 ; GFX10-NOT: alloca
251
244252 define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
245253 entry:
246254 %stack = alloca [29 x i8], align 4
0 # RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,ADDR64
11 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
22 # RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
3 # RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W32
34
45 # Test that we correctly legalize VGPR Rsrc operands in MUBUF instructions.
56 #
2121 ; GFX10-LABEL: or3:
2222 ; GFX10: ; %bb.0:
2323 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
24 ; GFX10-NEXT: ; implicit-def: $vcc_hi
2425 ; GFX10-NEXT: ; return to shader part epilog
2526 %x = or i32 %a, %b
2627 %result = or i32 %x, %c
4647 ; GFX10-LABEL: or3_vgpr_a:
4748 ; GFX10: ; %bb.0:
4849 ; GFX10-NEXT: v_or3_b32 v0, v0, s2, s3
50 ; GFX10-NEXT: ; implicit-def: $vcc_hi
4951 ; GFX10-NEXT: ; return to shader part epilog
5052 %x = or i32 %a, %b
5153 %result = or i32 %x, %c
6870 ; GFX10-LABEL: or3_vgpr_all2:
6971 ; GFX10: ; %bb.0:
7072 ; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0
73 ; GFX10-NEXT: ; implicit-def: $vcc_hi
7174 ; GFX10-NEXT: ; return to shader part epilog
7275 %x = or i32 %b, %c
7376 %result = or i32 %a, %x
9093 ; GFX10-LABEL: or3_vgpr_bc:
9194 ; GFX10: ; %bb.0:
9295 ; GFX10-NEXT: v_or3_b32 v0, s2, v0, v1
96 ; GFX10-NEXT: ; implicit-def: $vcc_hi
9397 ; GFX10-NEXT: ; return to shader part epilog
9498 %x = or i32 %a, %b
9599 %result = or i32 %x, %c
112116 ; GFX10-LABEL: or3_vgpr_const:
113117 ; GFX10: ; %bb.0:
114118 ; GFX10-NEXT: v_or3_b32 v0, v1, v0, 64
119 ; GFX10-NEXT: ; implicit-def: $vcc_hi
115120 ; GFX10-NEXT: ; return to shader part epilog
116121 %x = or i32 64, %b
117122 %result = or i32 %x, %a
4545 %0 = IMPLICIT_DEF
4646 %1 = IMPLICIT_DEF
4747 GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, 0, 0, implicit $exec
48 S_ENDPGM 0
49 ...
50
51 # GCN-LABEL: s11_vs_vcc{{$}}
52 # GCN: $vgpr0, $vcc_lo = V_ADDC_U32_e64 killed $sgpr14, killed $vgpr0, killed $vcc_lo, 0
53 ---
54 name: s11_vs_vcc
55 tracksRegLiveness: true
56 registers:
57 - { id: 0, class: sgpr_32, preferred-register: '$sgpr11' }
58 - { id: 1, class: vgpr_32 }
59 - { id: 2, class: vgpr_32 }
60 body: |
61 bb.0:
62 %0 = IMPLICIT_DEF
63 %1 = IMPLICIT_DEF
64 $vcc_lo = IMPLICIT_DEF
65 %2, $vcc_lo = V_ADDC_U32_e64 killed %0, killed %1, killed $vcc_lo, 0, implicit $exec
4866 S_ENDPGM 0
4967 ...
5068
2121 ; GFX10-LABEL: shl_add:
2222 ; GFX10: ; %bb.0:
2323 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, v1, v2
24 ; GFX10-NEXT: ; implicit-def: $vcc_hi
2425 ; GFX10-NEXT: ; return to shader part epilog
2526 %x = shl i32 %a, %b
2627 %result = add i32 %x, %c
4546 ; GFX10-LABEL: shl_add_vgpr_a:
4647 ; GFX10: ; %bb.0:
4748 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, s2, s3
49 ; GFX10-NEXT: ; implicit-def: $vcc_hi
4850 ; GFX10-NEXT: ; return to shader part epilog
4951 %x = shl i32 %a, %b
5052 %result = add i32 %x, %c
6769 ; GFX10-LABEL: shl_add_vgpr_all:
6870 ; GFX10: ; %bb.0:
6971 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, v1, v2
72 ; GFX10-NEXT: ; implicit-def: $vcc_hi
7073 ; GFX10-NEXT: ; return to shader part epilog
7174 %x = shl i32 %a, %b
7275 %result = add i32 %x, %c
8992 ; GFX10-LABEL: shl_add_vgpr_ab:
9093 ; GFX10: ; %bb.0:
9194 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, v1, s2
95 ; GFX10-NEXT: ; implicit-def: $vcc_hi
9296 ; GFX10-NEXT: ; return to shader part epilog
9397 %x = shl i32 %a, %b
9498 %result = add i32 %x, %c
111115 ; GFX10-LABEL: shl_add_vgpr_const:
112116 ; GFX10: ; %bb.0:
113117 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 3, v1
118 ; GFX10-NEXT: ; implicit-def: $vcc_hi
114119 ; GFX10-NEXT: ; return to shader part epilog
115120 %x = shl i32 %a, 3
116121 %result = add i32 %x, %b
2121 ; GFX10-LABEL: shl_or:
2222 ; GFX10: ; %bb.0:
2323 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v1, v2
24 ; GFX10-NEXT: ; implicit-def: $vcc_hi
2425 ; GFX10-NEXT: ; return to shader part epilog
2526 %x = shl i32 %a, %b
2627 %result = or i32 %x, %c
4445 ; GFX10-LABEL: shl_or_vgpr_c:
4546 ; GFX10: ; %bb.0:
4647 ; GFX10-NEXT: v_lshl_or_b32 v0, s2, s3, v0
48 ; GFX10-NEXT: ; implicit-def: $vcc_hi
4749 ; GFX10-NEXT: ; return to shader part epilog
4850 %x = shl i32 %a, %b
4951 %result = or i32 %x, %c
6668 ; GFX10-LABEL: shl_or_vgpr_all2:
6769 ; GFX10: ; %bb.0:
6870 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v1, v2
71 ; GFX10-NEXT: ; implicit-def: $vcc_hi
6972 ; GFX10-NEXT: ; return to shader part epilog
7073 %x = shl i32 %a, %b
7174 %result = or i32 %c, %x
8891 ; GFX10-LABEL: shl_or_vgpr_ac:
8992 ; GFX10: ; %bb.0:
9093 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, s2, v1
94 ; GFX10-NEXT: ; implicit-def: $vcc_hi
9195 ; GFX10-NEXT: ; return to shader part epilog
9296 %x = shl i32 %a, %b
9397 %result = or i32 %x, %c
110114 ; GFX10-LABEL: shl_or_vgpr_const:
111115 ; GFX10: ; %bb.0:
112116 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v1, 6
117 ; GFX10-NEXT: ; implicit-def: $vcc_hi
113118 ; GFX10-NEXT: ; return to shader part epilog
114119 %x = shl i32 %a, %b
115120 %result = or i32 %x, 6
132137 ; GFX10-LABEL: shl_or_vgpr_const2:
133138 ; GFX10: ; %bb.0:
134139 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 6, v1
140 ; GFX10-NEXT: ; implicit-def: $vcc_hi
135141 ; GFX10-NEXT: ; return to shader part epilog
136142 %x = shl i32 %a, 6
137143 %result = or i32 %x, %b
154160 ; GFX10-LABEL: shl_or_vgpr_const_scalar1:
155161 ; GFX10: ; %bb.0:
156162 ; GFX10-NEXT: v_lshl_or_b32 v0, s2, 6, v0
163 ; GFX10-NEXT: ; implicit-def: $vcc_hi
157164 ; GFX10-NEXT: ; return to shader part epilog
158165 %x = shl i32 %a, 6
159166 %result = or i32 %x, %b
176183 ; GFX10-LABEL: shl_or_vgpr_const_scalar2:
177184 ; GFX10: ; %bb.0:
178185 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 6, s2
186 ; GFX10-NEXT: ; implicit-def: $vcc_hi
179187 ; GFX10-NEXT: ; return to shader part epilog
180188 %x = shl i32 %a, 6
181189 %result = or i32 %x, %b
9191 ; GCN-DAG: s_mov_b32 s1, 1
9292 ; GCN-DAG: s_mov_b32 s0, 0
9393 ; SI-NEXT: nop 3
94 ; GFX10-NEXT: ; implicit-def: $vcc_hi
9495 ; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
9596 define amdgpu_ps float @smrd_hazard(<4 x i32> inreg %desc) #0 {
9697 main_body:
None ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s
12
23
34 ; GCN-LABEL: {{^}}sub_var_var_i1:
4 ; GCN: s_xor_b64
5 ; WAVE32: s_xor_b32
6 ; WAVE64: s_xor_b64
57 define amdgpu_kernel void @sub_var_var_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
68 %a = load volatile i1, i1 addrspace(1)* %in0
79 %b = load volatile i1, i1 addrspace(1)* %in1
1113 }
1214
1315 ; GCN-LABEL: {{^}}sub_var_imm_i1:
14 ; GCN: s_not_b64
16 ; WAVE32: s_not_b32
17 ; WAVE64: s_not_b64
1518 define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {
1619 %a = load volatile i1, i1 addrspace(1)* %in
1720 %sub = sub i1 %a, 1
2124
2225 ; GCN-LABEL: {{^}}sub_i1_cf:
2326 ; GCN: ; %endif
24 ; GCN: s_not_b64
27 ; WAVE32: s_not_b32
28 ; WAVE64: s_not_b64
2529 define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
2630 entry:
2731 %tid = call i32 @llvm.amdgcn.workitem.id.x()
0 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s
5
6 ; GCN-LABEL: {{^}}test_vopc_i32:
7 ; GFX1032: v_cmp_lt_i32_e32 vcc_lo, 0, v{{[0-9]+}}
8 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc_lo
9 ; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}}
10 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc{{$}}
11 define amdgpu_kernel void @test_vopc_i32(i32 addrspace(1)* %arg) {
12 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
13 %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid
14 %load = load i32, i32 addrspace(1)* %gep, align 4
15 %cmp = icmp sgt i32 %load, 0
16 %sel = select i1 %cmp, i32 1, i32 2
17 store i32 %sel, i32 addrspace(1)* %gep, align 4
18 ret void
19 }
20
21 ; GCN-LABEL: {{^}}test_vopc_f32:
22 ; GFX1032: v_cmp_nge_f32_e32 vcc_lo, 0, v{{[0-9]+}}
23 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc_lo
24 ; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}}
25 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc{{$}}
26 define amdgpu_kernel void @test_vopc_f32(float addrspace(1)* %arg) {
27 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
28 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %lid
29 %load = load float, float addrspace(1)* %gep, align 4
30 %cmp = fcmp ugt float %load, 0.0
31 %sel = select i1 %cmp, float 1.0, float 2.0
32 store float %sel, float addrspace(1)* %gep, align 4
33 ret void
34 }
35
36 ; GCN-LABEL: {{^}}test_vopc_vcmpx:
37 ; GFX1032: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
38 ; GFX1064: v_cmpx_le_f32_e32 0, v{{[0-9]+}}
39 define amdgpu_ps void @test_vopc_vcmpx(float %x) {
40 %cmp = fcmp oge float %x, 0.0
41 call void @llvm.amdgcn.kill(i1 %cmp)
42 ret void
43 }
44
45 ; GCN-LABEL: {{^}}test_vopc_2xf16:
46 ; GFX1032: v_cmp_le_f16_sdwa [[SC:s[0-9]+]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD
47 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]]
48 ; GFX1064: v_cmp_le_f16_sdwa [[SC:s\[[0-9:]+\]]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD
49 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]]
50 define amdgpu_kernel void @test_vopc_2xf16(<2 x half> addrspace(1)* %arg) {
51 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
52 %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %lid
53 %load = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4
54 %elt = extractelement <2 x half> %load, i32 1
55 %cmp = fcmp ugt half %elt, 0.0
56 %sel = select i1 %cmp, <2 x half> <half 1.0, half 1.0>, <2 x half> %load
57 store <2 x half> %sel, <2 x half> addrspace(1)* %gep, align 4
58 ret void
59 }
60
61 ; GCN-LABEL: {{^}}test_vopc_class:
62 ; GFX1032: v_cmp_class_f32_e64 [[C:vcc_lo|s[0-9:]+]], s{{[0-9]+}}, 0x204
63 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]
64 ; GFX1064: v_cmp_class_f32_e64 [[C:vcc|s\[[0-9:]+\]]], s{{[0-9]+}}, 0x204
65 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]{{$}}
66 define amdgpu_kernel void @test_vopc_class(i32 addrspace(1)* %out, float %x) #0 {
67 %fabs = tail call float @llvm.fabs.f32(float %x)
68 %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
69 %ext = zext i1 %cmp to i32
70 store i32 %ext, i32 addrspace(1)* %out, align 4
71 ret void
72 }
73
74 ; GCN-LABEL: {{^}}test_vcmp_vcnd_f16:
75 ; GFX1032: v_cmp_neq_f16_e64 [[C:vcc_lo|s[0-9]+]], 0x7c00, s{{[0-9]+}}
76 ; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]]
77
78 ; GFX1064: v_cmp_neq_f16_e64 [[C:vcc|s\[[0-9:]+\]]], 0x7c00, s{{[0-9]+}}
79 ; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]]{{$}}
80 define amdgpu_kernel void @test_vcmp_vcnd_f16(half addrspace(1)* %out, half %x) #0 {
81 %cmp = fcmp oeq half %x, 0x7FF0000000000000
82 %sel = select i1 %cmp, half 1.0, half %x
83 store half %sel, half addrspace(1)* %out, align 2
84 ret void
85 }
86
87 ; GCN-LABEL: {{^}}test_vop3_cmp_f32_sop_and:
88 ; GFX1032: v_cmp_nge_f32_e32 vcc_lo, 0, v{{[0-9]+}}
89 ; GFX1032: v_cmp_nle_f32_e64 [[C2:s[0-9]+]], 1.0, v{{[0-9]+}}
90 ; GFX1032: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]]
91 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]]
92 ; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}}
93 ; GFX1064: v_cmp_nle_f32_e64 [[C2:s\[[0-9:]+\]]], 1.0, v{{[0-9]+}}
94 ; GFX1064: s_and_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
95 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]]
96 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(float addrspace(1)* %arg) {
97 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
98 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %lid
99 %load = load float, float addrspace(1)* %gep, align 4
100 %cmp = fcmp ugt float %load, 0.0
101 %cmp2 = fcmp ult float %load, 1.0
102 %and = and i1 %cmp, %cmp2
103 %sel = select i1 %and, float 1.0, float 2.0
104 store float %sel, float addrspace(1)* %gep, align 4
105 ret void
106 }
107
108 ; GCN-LABEL: {{^}}test_vop3_cmp_i32_sop_xor:
109 ; GFX1032: v_cmp_lt_i32_e32 vcc_lo, 0, v{{[0-9]+}}
110 ; GFX1032: v_cmp_gt_i32_e64 [[C2:s[0-9]+]], 1, v{{[0-9]+}}
111 ; GFX1032: s_xor_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]]
112 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
113 ; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}}
114 ; GFX1064: v_cmp_gt_i32_e64 [[C2:s\[[0-9:]+\]]], 1, v{{[0-9]+}}
115 ; GFX1064: s_xor_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
116 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
117 define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(i32 addrspace(1)* %arg) {
118 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
119 %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid
120 %load = load i32, i32 addrspace(1)* %gep, align 4
121 %cmp = icmp sgt i32 %load, 0
122 %cmp2 = icmp slt i32 %load, 1
123 %xor = xor i1 %cmp, %cmp2
124 %sel = select i1 %xor, i32 1, i32 2
125 store i32 %sel, i32 addrspace(1)* %gep, align 4
126 ret void
127 }
128
129 ; GCN-LABEL: {{^}}test_vop3_cmp_u32_sop_or:
130 ; GFX1032: v_cmp_lt_u32_e32 vcc_lo, 3, v{{[0-9]+}}
131 ; GFX1032: v_cmp_gt_u32_e64 [[C2:s[0-9]+]], 2, v{{[0-9]+}}
132 ; GFX1032: s_or_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]]
133 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
134 ; GFX1064: v_cmp_lt_u32_e32 vcc, 3, v{{[0-9]+}}
135 ; GFX1064: v_cmp_gt_u32_e64 [[C2:s\[[0-9:]+\]]], 2, v{{[0-9]+}}
136 ; GFX1064: s_or_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
137 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
138 define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(i32 addrspace(1)* %arg) {
139 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
140 %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid
141 %load = load i32, i32 addrspace(1)* %gep, align 4
142 %cmp = icmp ugt i32 %load, 3
143 %cmp2 = icmp ult i32 %load, 2
144 %or = or i1 %cmp, %cmp2
145 %sel = select i1 %or, i32 1, i32 2
146 store i32 %sel, i32 addrspace(1)* %gep, align 4
147 ret void
148 }
149
150 ; GCN-LABEL: {{^}}test_mask_if:
151 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
152 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
153 ; GCN: ; mask branch
154 define amdgpu_kernel void @test_mask_if(i32 addrspace(1)* %arg) #0 {
155 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
156 %cmp = icmp ugt i32 %lid, 10
157 br i1 %cmp, label %if, label %endif
158
159 if:
160 store i32 0, i32 addrspace(1)* %arg, align 4
161 br label %endif
162
163 endif:
164 ret void
165 }
166
167 ; GCN-LABEL: {{^}}test_loop_with_if:
168 ; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
169 ; GFX1032: s_andn2_b32 exec_lo, exec_lo, s{{[0-9]+}}
170 ; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
171 ; GFX1064: s_andn2_b64 exec, exec, s[{{[0-9:]+}}]
172 ; GCN: s_cbranch_execz
173 ; GCN: BB{{.*}}:
174 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
175 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
176 ; GCN: s_cbranch_execz
177 ; GCN: BB{{.*}}:
178 ; GCN: BB{{.*}}:
179 ; GFX1032: s_xor_b32 s{{[0-9]+}}, exec_lo, s{{[0-9]+}}
180 ; GFX1064: s_xor_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}]
181 ; GCN: ; mask branch BB
182 ; GCN: BB{{.*}}:
183 ; GCN: BB{{.*}}:
184 ; GFX1032: s_or_b32 exec_lo, exec_lo, s{{[0-9]+}}
185 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, s{{[0-9]+}}
186 ; GFX1064: s_or_b64 exec, exec, s[{{[0-9:]+}}]
187 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
188 ; GCN: ; mask branch BB
189 ; GCN: BB{{.*}}:
190 ; GCN: BB{{.*}}:
191 ; GCN: s_endpgm
192 define amdgpu_kernel void @test_loop_with_if(i32 addrspace(1)* %arg) #0 {
193 bb:
194 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
195 br label %bb2
196
197 bb1:
198 ret void
199
200 bb2:
201 %tmp3 = phi i32 [ 0, %bb ], [ %tmp15, %bb13 ]
202 %tmp4 = icmp slt i32 %tmp3, %tmp
203 br i1 %tmp4, label %bb5, label %bb11
204
205 bb5:
206 %tmp6 = sext i32 %tmp3 to i64
207 %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
208 %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4
209 %tmp9 = icmp sgt i32 %tmp8, 10
210 br i1 %tmp9, label %bb10, label %bb11
211
212 bb10:
213 store i32 %tmp, i32 addrspace(1)* %tmp7, align 4
214 br label %bb13
215
216 bb11:
217 %tmp12 = sdiv i32 %tmp3, 2
218 br label %bb13
219
220 bb13:
221 %tmp14 = phi i32 [ %tmp3, %bb10 ], [ %tmp12, %bb11 ]
222 %tmp15 = add nsw i32 %tmp14, 1
223 %tmp16 = icmp slt i32 %tmp14, 255
224 br i1 %tmp16, label %bb2, label %bb1
225 }
226
227 ; GCN-LABEL: {{^}}test_loop_with_if_else_break:
228 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
229 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
230 ; GCN: ; mask branch
231 ; GCN: s_cbranch_execz
232 ; GCN: BB{{.*}}:
233 ; GCN: BB{{.*}}:
234 ; GFX1032: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, exec_lo
235 ; GFX1064: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], exec
236 ; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}}
237 ; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
238 ; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}]
239 ; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
240 ; GCN: s_cbranch_execz
241 ; GCN: BB{{.*}}:
242 define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 {
243 bb:
244 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
245 %tmp1 = icmp eq i32 %tmp, 0
246 br i1 %tmp1, label %.loopexit, label %.preheader
247
248 .preheader:
249 br label %bb2
250
251 bb2:
252 %tmp3 = phi i32 [ %tmp9, %bb8 ], [ 0, %.preheader ]
253 %tmp4 = zext i32 %tmp3 to i64
254 %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
255 %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
256 %tmp7 = icmp sgt i32 %tmp6, 10
257 br i1 %tmp7, label %bb8, label %.loopexit
258
259 bb8:
260 store i32 %tmp, i32 addrspace(1)* %tmp5, align 4
261 %tmp9 = add nuw nsw i32 %tmp3, 1
262 %tmp10 = icmp ult i32 %tmp9, 256
263 %tmp11 = icmp ult i32 %tmp9, %tmp
264 %tmp12 = and i1 %tmp10, %tmp11
265 br i1 %tmp12, label %bb2, label %.loopexit
266
267 .loopexit:
268 ret void
269 }
270
271 ; GCN-LABEL: {{^}}test_addc_vop2b:
272 ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, s{{[0-9]+}}
273 ; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, vcc_lo
274 ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, s{{[0-9]+}}
275 ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
276 define amdgpu_kernel void @test_addc_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
277 bb:
278 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
279 %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
280 %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8
281 %tmp5 = add nsw i64 %tmp4, %arg1
282 store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8
283 ret void
284 }
285
286 ; GCN-LABEL: {{^}}test_subbrev_vop2b:
287 ; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
288 ; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}}
289 ; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
290 ; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}}
291 define amdgpu_kernel void @test_subbrev_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
292 bb:
293 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
294 %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
295 %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8
296 %tmp5 = sub nsw i64 %tmp4, %arg1
297 store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8
298 ret void
299 }
300
301 ; GCN-LABEL: {{^}}test_subb_vop2b:
302 ; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
303 ; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}}
304 ; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
305 ; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}}
306 define amdgpu_kernel void @test_subb_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
307 bb:
308 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
309 %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
310 %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8
311 %tmp5 = sub nsw i64 %arg1, %tmp4
312 store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8
313 ret void
314 }
315
316 ; GCN-LABEL: {{^}}test_udiv64:
317 ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
318 ; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
319 ; GFX1032: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]]
320 ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
321 ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
322 ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
323 ; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
324 ; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
325 ; GFX1032: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
326 ; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
327 ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s\[[0-9:]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
328 ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
329 ; GFX1064: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]]
330 ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
331 ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
332 ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
333 ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
334 ; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
335 ; GFX1064: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}}
336 ; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}}
337 define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 {
338 bb:
339 %tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1
340 %tmp1 = load i64, i64 addrspace(1)* %tmp, align 8
341 %tmp2 = load i64, i64 addrspace(1)* %arg, align 8
342 %tmp3 = udiv i64 %tmp1, %tmp2
343 %tmp4 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 2
344 store i64 %tmp3, i64 addrspace(1)* %tmp4, align 8
345 ret void
346 }
347
348 ; GCN-LABEL: {{^}}test_div_scale_f32:
349 ; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
350 ; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
351 define amdgpu_kernel void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
352 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
353 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
354 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
355
356 %a = load volatile float, float addrspace(1)* %gep.0, align 4
357 %b = load volatile float, float addrspace(1)* %gep.1, align 4
358
359 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
360 %result0 = extractvalue { float, i1 } %result, 0
361 store float %result0, float addrspace(1)* %out, align 4
362 ret void
363 }
364
365 ; GCN-LABEL: {{^}}test_div_scale_f64:
366 ; GFX1032: v_div_scale_f64 v[{{[0-9:]+}}], s{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
367 ; GFX1064: v_div_scale_f64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
368 define amdgpu_kernel void @test_div_scale_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) #0 {
369 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
370 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
371 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
372
373 %a = load volatile double, double addrspace(1)* %gep.0, align 8
374 %b = load volatile double, double addrspace(1)* %gep.1, align 8
375
376 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
377 %result0 = extractvalue { double, i1 } %result, 0
378 store double %result0, double addrspace(1)* %out, align 8
379 ret void
380 }
381
382 ; GCN-LABEL: {{^}}test_mad_i64_i32:
383 ; GFX1032: v_mad_i64_i32 v[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
384 ; GFX1064: v_mad_i64_i32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
385 define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
386 %sext0 = sext i32 %arg0 to i64
387 %sext1 = sext i32 %arg1 to i64
388 %mul = mul i64 %sext0, %sext1
389 %mad = add i64 %mul, %arg2
390 ret i64 %mad
391 }
392
393 ; GCN-LABEL: {{^}}test_mad_u64_u32:
394 ; GFX1032: v_mad_u64_u32 v[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
395 ; GFX1064: v_mad_u64_u32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}]
396 define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
397 %sext0 = zext i32 %arg0 to i64
398 %sext1 = zext i32 %arg1 to i64
399 %mul = mul i64 %sext0, %sext1
400 %mad = add i64 %mul, %arg2
401 ret i64 %mad
402 }
403
404 ; GCN-LABEL: {{^}}test_div_fmas_f32:
405 ; GFX1032: v_cmp_eq_u32_e64 vcc_lo,
406 ; GFX1064: v_cmp_eq_u32_e64 vcc,
407 ; GCN: v_div_fmas_f32 v{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
408 define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
409 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
410 store float %result, float addrspace(1)* %out, align 4
411 ret void
412 }
413
414 ; GCN-LABEL: {{^}}test_div_fmas_f64:
415 ; GFX1032: v_cmp_eq_u32_e64 vcc_lo,
416 ; GFX1064: v_cmp_eq_u32_e64 vcc,
417 ; GCN-DAG: v_div_fmas_f64 v[{{[0-9:]+}}], {{[vs]}}[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
418 define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
419 %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
420 store double %result, double addrspace(1)* %out, align 8
421 ret void
422 }
423
424 ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
425 ; GFX1032: s_mov_b32 [[VCC:vcc_lo]], 0{{$}}
426 ; GFX1064: s_mov_b64 [[VCC:vcc]], 0{{$}}
427 ; GFX1032: s_and_saveexec_b32 [[SAVE:s[0-9]+]], s{{[0-9]+}}{{$}}
428 ; GFX1064: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], s[{{[0-9:]+}}]{{$}}
429
430 ; GCN: load_dword [[LOAD:v[0-9]+]]
431 ; GCN: v_cmp_ne_u32_e32 [[VCC]], 0, [[LOAD]]
432
433 ; GCN: BB{{[0-9_]+}}:
434 ; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE]]
435 ; GFX1064: s_or_b64 exec, exec, [[SAVE]]
436 ; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
437 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) #0 {
438 entry:
439 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
440 %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
441 %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
442 %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
443 %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
444
445 %a = load float, float addrspace(1)* %gep.a
446 %b = load float, float addrspace(1)* %gep.b
447 %c = load float, float addrspace(1)* %gep.c
448
449 %cmp0 = icmp eq i32 %tid, 0
450 br i1 %cmp0, label %bb, label %exit
451
452 bb:
453 %val = load volatile i32, i32 addrspace(1)* %dummy
454 %cmp1 = icmp ne i32 %val, 0
455 br label %exit
456
457 exit:
458 %cond = phi i1 [false, %entry], [%cmp1, %bb]
459 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
460 store float %result, float addrspace(1)* %gep.out, align 4
461 ret void
462 }
463
464 ; GCN-LABEL: {{^}}fdiv_f32:
465 ; GFC1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
466 ; GFC1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
467 ; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
468 ; GCN-NOT: vcc
469 ; GCN: v_div_fmas_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
470 define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
471 entry:
472 %fdiv = fdiv float %a, %b
473 store float %fdiv, float addrspace(1)* %out
474 ret void
475 }
476
477 ; GCN-LABEL: {{^}}test_br_cc_f16:
478 ; GFX1032: v_cmp_nlt_f16_e32 vcc_lo,
479 ; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
480 ; GFX1064: v_cmp_nlt_f16_e32 vcc,
481 ; GFX1064-NEXT: s_and_b64 vcc, exec, vcc{{$}}
482 ; GCN-NEXT: s_cbranch_vccnz
483 define amdgpu_kernel void @test_br_cc_f16(
484 half addrspace(1)* %r,
485 half addrspace(1)* %a,
486 half addrspace(1)* %b) {
487 entry:
488 %a.val = load half, half addrspace(1)* %a
489 %b.val = load half, half addrspace(1)* %b
490 %fcmp = fcmp olt half %a.val, %b.val
491 br i1 %fcmp, label %one, label %two
492
493 one:
494 store half %a.val, half addrspace(1)* %r
495 ret void
496
497 two:
498 store half %b.val, half addrspace(1)* %r
499 ret void
500 }
501
502 ; GCN-LABEL: {{^}}test_brcc_i1:
503 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0
504 ; GCN-NEXT: s_cbranch_scc1
505 define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
506 %cmp0 = icmp ne i1 %val, 0
507 br i1 %cmp0, label %store, label %end
508
509 store:
510 store i32 222, i32 addrspace(1)* %out
511 ret void
512
513 end:
514 ret void
515 }
516
517 ; GCN-LABEL: {{^}}test_preserve_condition_undef_flag:
518 ; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0
519 ; GFX1032: v_cmp_ngt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 0
520 ; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0
521 ; GFX1032: s_or_b32 [[OR1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
522 ; GFX1032: s_or_b32 [[OR2:s[0-9]+]], [[OR1]], s{{[0-9]+}}
523 ; GFX1032: s_and_b32 vcc_lo, exec_lo, [[OR2]]
524 ; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0
525 ; GFX1064: v_cmp_ngt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 0
526 ; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0
527 ; GFX1064: s_or_b64 [[OR1:s\[[0-9:]+\]]], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
528 ; GFX1064: s_or_b64 [[OR2:s\[[0-9:]+\]]], [[OR1]], s[{{[0-9:]+}}]
529 ; GFX1064: s_and_b64 vcc, exec, [[OR2]]
530 ; GCN: s_cbranch_vccnz
531 define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) #0 {
532 bb0:
533 %tmp = icmp sgt i32 %arg1, 4
534 %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
535 %tmp4 = select i1 %undef, float %arg, float 1.000000e+00
536 %tmp5 = fcmp ogt float %arg2, 0.000000e+00
537 %tmp6 = fcmp olt float %arg2, 1.000000e+00
538 %tmp7 = fcmp olt float %arg, %tmp4
539 %tmp8 = and i1 %tmp5, %tmp6
540 %tmp9 = and i1 %tmp8, %tmp7
541 br i1 %tmp9, label %bb1, label %bb2
542
543 bb1:
544 store volatile i32 0, i32 addrspace(1)* undef
545 br label %bb2
546
547 bb2:
548 ret void
549 }
550
551 ; GCN-LABEL: {{^}}test_invert_true_phi_cond_break_loop:
552 ; GFX1032: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, -1
553 ; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
554 ; GFX1064: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], -1
555 ; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
556 define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 {
557 bb:
558 %id = call i32 @llvm.amdgcn.workitem.id.x()
559 %tmp = sub i32 %id, %arg
560 br label %bb1
561
562 bb1: ; preds = %Flow, %bb
563 %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
564 %lsr.iv.next = add i32 %lsr.iv, 1
565 %cmp0 = icmp slt i32 %lsr.iv.next, 0
566 br i1 %cmp0, label %bb4, label %Flow
567
568 bb4: ; preds = %bb1
569 %load = load volatile i32, i32 addrspace(1)* undef, align 4
570 %cmp1 = icmp sge i32 %tmp, %load
571 br label %Flow
572
573 Flow: ; preds = %bb4, %bb1
574 %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
575 %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
576 br i1 %tmp3, label %bb1, label %bb9
577
578 bb9: ; preds = %Flow
579 store volatile i32 7, i32 addrspace(3)* undef
580 ret void
581 }
582
583 ; GCN-LABEL: {{^}}test_movrels_extract_neg_offset_vgpr:
584 ; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 1, v{{[0-9]+}}
585 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc_lo
586 ; GFX1032: v_cmp_ne_u32_e32 vcc_lo, 2, v{{[0-9]+}}
587 ; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc_lo
588 ; GFX1032: v_cmp_ne_u32_e32 vcc_lo, 3, v{{[0-9]+}}
589 ; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc_lo
590 ; GFX1064: v_cmp_eq_u32_e32 vcc, 1, v{{[0-9]+}}
591 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
592 ; GFX1064: v_cmp_ne_u32_e32 vcc, 2, v{{[0-9]+}}
593 ; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc
594 ; GFX1064: v_cmp_ne_u32_e32 vcc, 3, v{{[0-9]+}}
595 ; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
596 define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(i32 addrspace(1)* %out) #0 {
597 entry:
598 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
599 %index = add i32 %id, -512
600 %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
601 store i32 %value, i32 addrspace(1)* %out
602 ret void
603 }
604
605 ; GCN-LABEL: {{^}}test_set_inactive:
606 ; GFX1032: s_not_b32 exec_lo, exec_lo
607 ; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 42
608 ; GFX1032: s_not_b32 exec_lo, exec_lo
609 ; GFX1064: s_not_b64 exec, exec{{$}}
610 ; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 42
611 ; GFX1064: s_not_b64 exec, exec{{$}}
612 define amdgpu_kernel void @test_set_inactive(i32 addrspace(1)* %out, i32 %in) #0 {
613 %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
614 store i32 %tmp, i32 addrspace(1)* %out
615 ret void
616 }
617
618 ; GCN-LABEL: {{^}}test_set_inactive_64:
619 ; GFX1032: s_not_b32 exec_lo, exec_lo
620 ; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 0
621 ; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 0
622 ; GFX1032: s_not_b32 exec_lo, exec_lo
623 ; GFX1064: s_not_b64 exec, exec{{$}}
624 ; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0
625 ; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0
626 ; GFX1064: s_not_b64 exec, exec{{$}}
627 define amdgpu_kernel void @test_set_inactive_64(i64 addrspace(1)* %out, i64 %in) #0 {
628 %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
629 store i64 %tmp, i64 addrspace(1)* %out
630 ret void
631 }
632
633 ; GCN-LABEL: {{^}}test_kill_i1_terminator_float:
634 ; GFX1032: s_mov_b32 exec_lo, 0
635 ; GFX1064: s_mov_b64 exec, 0
636 define amdgpu_ps void @test_kill_i1_terminator_float() #0 {
637 call void @llvm.amdgcn.kill(i1 false)
638 ret void
639 }
640
641 ; GCN-LABEL: {{^}}test_kill_i1_terminator_i1:
642 ; GFX1032: s_or_b32 [[OR:s[0-9]+]],
643 ; GFX1032: s_and_b32 exec_lo, exec_lo, [[OR]]
644 ; GFX1064: s_or_b64 [[OR:s\[[0-9:]+\]]],
645 ; GFX1064: s_and_b64 exec, exec, [[OR]]
646 define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 {
647 %c1 = icmp slt i32 %a, %b
648 %c2 = icmp slt i32 %c, %d
649 %x = or i1 %c1, %c2
650 call void @llvm.amdgcn.kill(i1 %x)
651 ret void
652 }
653
654 ; GCN-LABEL: {{^}}test_loop_vcc:
655 ; GFX1032: v_cmp_lt_f32_e32 vcc_lo,
656 ; GFX1064: v_cmp_lt_f32_e32 vcc,
657 ; GCN: s_cbranch_vccnz
658 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 {
659 entry:
660 br label %loop
661
662 loop:
663 %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
664 %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
665 %cc = fcmp ogt float %ctr.iv, 7.0
666 br i1 %cc, label %break, label %body
667
668 body:
669 %c.iv0 = extractelement <4 x float> %c.iv, i32 0
670 %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)
671 %ctr.next = fadd float %ctr.iv, 2.0
672 br label %loop
673
674 break:
675 ret <4 x float> %c.iv
676 }
677
678 ; GCN-LABEL: {{^}}test_wwm1:
679 ; GFX1032: s_or_saveexec_b32 [[SAVE:s[0-9]+]], -1
680 ; GFX1032: s_mov_b32 exec_lo, [[SAVE]]
681 ; GFX1064: s_or_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], -1
682 ; GFX1064: s_mov_b64 exec, [[SAVE]]
683 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) {
684 main_body:
685 %out = fadd float %src0, %src1
686 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
687 ret float %out.0
688 }
689
690 ; GCN-LABEL: {{^}}test_wwm2:
691 ; GFX1032: v_cmp_gt_u32_e32 vcc_lo, 32, v{{[0-9]+}}
692 ; GFX1032: s_and_saveexec_b32 [[SAVE1:s[0-9]+]], vcc_lo
693 ; GFX1032: s_or_saveexec_b32 [[SAVE2:s[0-9]+]], -1
694 ; GFX1032: s_mov_b32 exec_lo, [[SAVE2]]
695 ; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE1]]
696 ; GFX1064: v_cmp_gt_u32_e32 vcc, 32, v{{[0-9]+}}
697 ; GFX1064: s_and_saveexec_b64 [[SAVE1:s\[[0-9:]+\]]], vcc{{$}}
698 ; GFX1064: s_or_saveexec_b64 [[SAVE2:s\[[0-9:]+\]]], -1
699 ; GFX1064: s_mov_b64 exec, [[SAVE2]]
700 ; GFX1064: s_or_b64 exec, exec, [[SAVE1]]
701 define amdgpu_ps float @test_wwm2(i32 inreg %idx) {
702 main_body:
703 ; use mbcnt to make sure the branch is divergent
704 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
705 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
706 %cc = icmp uge i32 %hi, 32
707 br i1 %cc, label %endif, label %if
708
709 if:
710 %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
711 %out = fadd float %src, %src
712 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
713 %out.1 = fadd float %src, %out.0
714 br label %endif
715
716 endif:
717 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
718 ret float %out.2
719 }
720
721 ; GCN-LABEL: {{^}}test_wqm1:
722 ; GFX1032: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo
723 ; GFX1032: s_wqm_b32 exec_lo, exec_lo
724 ; GFX1032: s_and_b32 exec_lo, exec_lo, [[ORIG]]
725 ; GFX1064: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec{{$}}
726 ; GFX1064: s_wqm_b64 exec, exec{{$}}
727 ; GFX1064: s_and_b64 exec, exec, [[ORIG]]
728 define amdgpu_ps <4 x float> @test_wqm1(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #0 {
729 main_body:
730 %inst23 = extractelement <2 x float> %pos, i32 0
731 %inst24 = extractelement <2 x float> %pos, i32 1
732 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
733 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
734 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
735 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
736 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0)
737 ret <4 x float> %tex
738 }
739
740 ; GCN-LABEL: {{^}}test_wqm2:
741 ; GFX1032: s_wqm_b32 exec_lo, exec_lo
742 ; GFX1032: s_and_b32 exec_lo, exec_lo, s{{[0-9]+}}
743 ; GFX1064: s_wqm_b64 exec, exec{{$}}
744 ; GFX1064: s_and_b64 exec, exec, s[{{[0-9:]+}}]
745 define amdgpu_ps float @test_wqm2(i32 inreg %idx0, i32 inreg %idx1) #0 {
746 main_body:
747 %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
748 %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
749 %out = fadd float %src0, %src1
750 %out.0 = bitcast float %out to i32
751 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
752 %out.2 = bitcast i32 %out.1 to float
753 ret float %out.2
754 }
755
756 ; GCN-LABEL: {{^}}test_intr_fcmp_i64:
757 ; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}}
758 ; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
759 ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
760 ; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
761 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
762 ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
763 ; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]],
764 define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src, float %a) {
765 %temp = call float @llvm.fabs.f32(float %a)
766 %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
767 store i64 %result, i64 addrspace(1)* %out
768 ret void
769 }
770
771 ; GCN-LABEL: {{^}}test_intr_icmp_i64:
772 ; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}}
773 ; GFX1032-DAG: v_cmp_eq_u32_e64 [[C_LO:vcc_lo|s[0-9]+]], 0x64, {{s[0-9]+}}
774 ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[C_LO]]
775 ; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], 0x64, {{s[0-9]+}}
776 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
777 ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
778 ; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]],
779 define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) {
780 %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32)
781 store i64 %result, i64 addrspace(1)* %out
782 ret void
783 }
784
785 ; GCN-LABEL: {{^}}test_intr_fcmp_i32:
786 ; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
787 ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
788 ; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
789 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
790 ; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]],
791 define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src, float %a) {
792 %temp = call float @llvm.fabs.f32(float %a)
793 %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
794 store i32 %result, i32 addrspace(1)* %out
795 ret void
796 }
797
798 ; GCN-LABEL: {{^}}test_intr_icmp_i32:
799 ; GFX1032-DAG: v_cmp_eq_u32_e64 s[[C_LO:[0-9]+]], 0x64, {{s[0-9]+}}
800 ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
801 ; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}}
802 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
803 ; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]],
804 define amdgpu_kernel void @test_intr_icmp_i32(i32 addrspace(1)* %out, i32 %src) {
805 %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32)
806 store i32 %result, i32 addrspace(1)* %out
807 ret void
808 }
809
810 ; GCN-LABEL: {{^}}test_wqm_vote:
811 ; GFX1032: v_cmp_neq_f32_e32 vcc_lo, 0
812 ; GFX1032: s_wqm_b32 [[WQM:s[0-9]+]], vcc_lo
813 ; GFX1032: s_and_b32 exec_lo, exec_lo, [[WQM]]
814 ; GFX1064: v_cmp_neq_f32_e32 vcc, 0
815 ; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc{{$}}
816 ; GFX1064: s_and_b64 exec, exec, [[WQM]]
817 define amdgpu_ps void @test_wqm_vote(float %a) {
818 %c1 = fcmp une float %a, 0.0
819 %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
820 call void @llvm.amdgcn.kill(i1 %c2)
821 ret void
822 }
823
824 ; GCN-LABEL: {{^}}test_branch_true:
825 ; GFX1032: s_and_b32 vcc_lo, exec_lo, -1
826 ; GFX1064: s_and_b64 vcc, exec, -1
827 define amdgpu_kernel void @test_branch_true() #2 {
828 entry:
829 br i1 true, label %for.end, label %for.body.lr.ph
830
831 for.body.lr.ph: ; preds = %entry
832 br label %for.body
833
834 for.body: ; preds = %for.body, %for.body.lr.ph
835 br i1 undef, label %for.end, label %for.body
836
837 for.end: ; preds = %for.body, %entry
838 ret void
839 }
840
841 ; GCN-LABEL: {{^}}test_ps_live:
842 ; GFX1032: s_mov_b32 [[C:s[0-9]+]], exec_lo
843 ; GFX1064: s_mov_b64 [[C:s\[[0-9:]+\]]], exec{{$}}
844 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]
845 define amdgpu_ps float @test_ps_live() #0 {
846 %live = call i1 @llvm.amdgcn.ps.live()
847 %live.32 = zext i1 %live to i32
848 %r = bitcast i32 %live.32 to float
849 ret float %r
850 }
851
852 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64:
853 ; GFX1032: v_cmp_neq_f64_e64 [[C:s[0-9]+]], s[{{[0-9:]+}}], 1.0
854 ; GFX1032: s_and_b32 vcc_lo, exec_lo, [[C]]
855 ; GFX1064: v_cmp_neq_f64_e64 [[C:s\[[0-9:]+\]]], s[{{[0-9:]+}}], 1.0
856 ; GFX1064: s_and_b64 vcc, exec, [[C]]
857 define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
858 entry:
859 %v = load double, double addrspace(1)* %in
860 %cc = fcmp oeq double %v, 1.000000e+00
861 br i1 %cc, label %if, label %endif
862
863 if:
864 %u = fadd double %v, %v
865 br label %endif
866
867 endif:
868 %r = phi double [ %v, %entry ], [ %u, %if ]
869 store double %r, double addrspace(1)* %out
870 ret void
871 }
872
873 ; GCN-LABEL: {{^}}test_init_exec:
874 ; GFX1032: s_mov_b32 exec_lo, 0x12345
875 ; GFX1064: s_mov_b64 exec, 0x12345
876 ; GCN: v_add_f32_e32 v0,
877 define amdgpu_ps float @test_init_exec(float %a, float %b) {
878 main_body:
879 %s = fadd float %a, %b
880 call void @llvm.amdgcn.init.exec(i64 74565)
881 ret float %s
882 }
883
884 ; GCN-LABEL: {{^}}test_init_exec_from_input:
885 ; GCN: s_bfe_u32 s0, s3, 0x70008
886 ; GFX1032: s_bfm_b32 exec_lo, s0, 0
887 ; GFX1032: s_cmp_eq_u32 s0, 32
888 ; GFX1032: s_cmov_b32 exec_lo, -1
889 ; GFX1064: s_bfm_b64 exec, s0, 0
890 ; GFX1064: s_cmp_eq_u32 s0, 64
891 ; GFX1064: s_cmov_b64 exec, -1
892 ; GCN: v_add_f32_e32 v0,
893 define amdgpu_ps float @test_init_exec_from_input(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
894 main_body:
895 %s = fadd float %a, %b
896 call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
897 ret float %s
898 }
899
900 ; GCN-LABEL: {{^}}test_vgprblocks_w32_attr:
901 ; Test that the wave size can be overridden in function attributes and that the block size is correct as a result
902 ; GFX10DEFWAVE: ; VGPRBlocks: 1
903 define amdgpu_gs float @test_vgprblocks_w32_attr(float %a, float %b, float %c, float %d, float %e,
904 float %f, float %g, float %h, float %i, float %j, float %k, float %l) #3 {
905 main_body:
906 %s = fadd float %a, %b
907 %s.1 = fadd float %s, %c
908 %s.2 = fadd float %s.1, %d
909 %s.3 = fadd float %s.2, %e
910 %s.4 = fadd float %s.3, %f
911 %s.5 = fadd float %s.4, %g
912 %s.6 = fadd float %s.5, %h
913 %s.7 = fadd float %s.6, %i
914 %s.8 = fadd float %s.7, %j
915 %s.9 = fadd float %s.8, %k
916 %s.10 = fadd float %s.9, %l
917 ret float %s.10
918 }
919
920 ; GCN-LABEL: {{^}}test_vgprblocks_w64_attr:
921 ; Test that the wave size can be overridden in function attributes and that the block size is correct as a result
922 ; GFX10DEFWAVE: ; VGPRBlocks: 2
923 define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e,
924 float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 {
925 main_body:
926 %s = fadd float %a, %b
927 %s.1 = fadd float %s, %c
928 %s.2 = fadd float %s.1, %d
929 %s.3 = fadd float %s.2, %e
930 %s.4 = fadd float %s.3, %f
931 %s.5 = fadd float %s.4, %g
932 %s.6 = fadd float %s.5, %h
933 %s.7 = fadd float %s.6, %i
934 %s.8 = fadd float %s.7, %j
935 %s.9 = fadd float %s.8, %k
936 %s.10 = fadd float %s.9, %l
937 ret float %s.10
938 }
939
940 ; GCN-LABEL: {{^}}icmp64:
941 ; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v
942 ; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v
943 define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
944 entry:
945 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
946 %mul4 = mul nsw i32 %s, %n
947 %cmp = icmp slt i32 0, %mul4
948 br label %if.end
949
950 if.end: ; preds = %entry
951 %rem = urem i32 %id, %s
952 %icmp = tail call i64 @llvm.amdgcn.icmp.i64.i32(i32 %rem, i32 0, i32 32)
953 %shr = lshr i64 %icmp, 1
954 %notmask = shl nsw i64 -1, 0
955 %and = and i64 %notmask, %shr
956 %or = or i64 %and, -9223372036854775808
957 %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true)
958 %cast = trunc i64 %cttz to i32
959 %cmp3 = icmp ugt i32 10, %cast
960 %cmp6 = icmp ne i32 %rem, 0
961 %brmerge = or i1 %cmp6, %cmp3
962 br i1 %brmerge, label %if.end2, label %if.then
963
964 if.then: ; preds = %if.end
965 unreachable
966
967 if.end2: ; preds = %if.end
968 ret void
969 }
970
971 ; GCN-LABEL: {{^}}fcmp64:
972 ; GFX1032: v_cmp_eq_f32_e32 vcc_lo, 0, v
973 ; GFX1064: v_cmp_eq_f32_e32 vcc, 0, v
974 define amdgpu_kernel void @fcmp64(float %n, float %s) {
975 entry:
976 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
977 %id.f = uitofp i32 %id to float
978 %mul4 = fmul float %s, %n
979 %cmp = fcmp ult float 0.0, %mul4
980 br label %if.end
981
982 if.end: ; preds = %entry
983 %rem.f = frem float %id.f, %s
984 %fcmp = tail call i64 @llvm.amdgcn.fcmp.i64.f32(float %rem.f, float 0.0, i32 1)
985 %shr = lshr i64 %fcmp, 1
986 %notmask = shl nsw i64 -1, 0
987 %and = and i64 %notmask, %shr
988 %or = or i64 %and, -9223372036854775808
989 %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true)
990 %cast = trunc i64 %cttz to i32
991 %cmp3 = icmp ugt i32 10, %cast
992 %cmp6 = fcmp one float %rem.f, 0.0
993 %brmerge = or i1 %cmp6, %cmp3
994 br i1 %brmerge, label %if.end2, label %if.then
995
996 if.then: ; preds = %if.end
997 unreachable
998
999 if.end2: ; preds = %if.end
1000 ret void
1001 }
1002
1003 ; GCN-LABEL: {{^}}icmp32:
1004 ; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v
1005 ; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v
1006 define amdgpu_kernel void @icmp32(i32 %n, i32 %s) {
1007 entry:
1008 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
1009 %mul4 = mul nsw i32 %s, %n
1010 %cmp = icmp slt i32 0, %mul4
1011 br label %if.end
1012
1013 if.end: ; preds = %entry
1014 %rem = urem i32 %id, %s
1015 %icmp = tail call i32 @llvm.amdgcn.icmp.i32.i32(i32 %rem, i32 0, i32 32)
1016 %shr = lshr i32 %icmp, 1
1017 %notmask = shl nsw i32 -1, 0
1018 %and = and i32 %notmask, %shr
1019 %or = or i32 %and, 2147483648
1020 %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true)
1021 %cmp3 = icmp ugt i32 10, %cttz
1022 %cmp6 = icmp ne i32 %rem, 0
1023 %brmerge = or i1 %cmp6, %cmp3
1024 br i1 %brmerge, label %if.end2, label %if.then
1025
1026 if.then: ; preds = %if.end
1027 unreachable
1028
1029 if.end2: ; preds = %if.end
1030 ret void
1031 }
1032
1033 ; GCN-LABEL: {{^}}fcmp32:
1034 ; GFX1032: v_cmp_eq_f32_e32 vcc_lo, 0, v
1035 ; GFX1064: v_cmp_eq_f32_e32 vcc, 0, v
1036 define amdgpu_kernel void @fcmp32(float %n, float %s) {
1037 entry:
1038 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
1039 %id.f = uitofp i32 %id to float
1040 %mul4 = fmul float %s, %n
1041 %cmp = fcmp ult float 0.0, %mul4
1042 br label %if.end
1043
1044 if.end: ; preds = %entry
1045 %rem.f = frem float %id.f, %s
1046 %fcmp = tail call i32 @llvm.amdgcn.fcmp.i32.f32(float %rem.f, float 0.0, i32 1)
1047 %shr = lshr i32 %fcmp, 1
1048 %notmask = shl nsw i32 -1, 0
1049 %and = and i32 %notmask, %shr
1050 %or = or i32 %and, 2147483648
1051 %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true)
1052 %cmp3 = icmp ugt i32 10, %cttz
1053 %cmp6 = fcmp one float %rem.f, 0.0
1054 %brmerge = or i1 %cmp6, %cmp3
1055 br i1 %brmerge, label %if.end2, label %if.then
1056
1057 if.then: ; preds = %if.end
1058 unreachable
1059
1060 if.end2: ; preds = %if.end
1061 ret void
1062 }
1063
1064 declare void @external_void_func_void() #1
1065
1066 ; Test save/restore of VGPR needed for SGPR spilling.
1067
1068 ; GCN-LABEL: {{^}}callee_no_stack_with_call:
1069 ; GCN: s_waitcnt
1070 ; GCN: s_mov_b32 s5, s32
1071 ; GFX1064: s_add_u32 s32, s32, 0x400
1072 ; GFX1032: s_add_u32 s32, s32, 0x200
1073
1074 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
1075 ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]+]], -1{{$}}
1076
1077 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
1078
1079 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
1080 ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]
1081
1082 ; GCN-DAG: v_writelane_b32 v32, s33, 0
1083 ; GCN-DAG: v_writelane_b32 v32, s34, 1
1084 ; GCN-DAG: s_mov_b32 s33, s5
1085 ; GCN: s_swappc_b64
1086 ; GCN-DAG: s_mov_b32 s5, s33
1087 ; GCN-DAG: v_readlane_b32 s34, v32, 1
1088 ; GCN-DAG: v_readlane_b32 s33, v32, 0
1089
1090 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
1091 ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]+]], -1{{$}}
1092 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
1093 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
1094 ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]
1095
1096 ; GFX1064: s_sub_u32 s32, s32, 0x400
1097 ; GFX1032: s_sub_u32 s32, s32, 0x200
1098 ; GCN: s_setpc_b64
1099 define void @callee_no_stack_with_call() #1 {
1100 call void @external_void_func_void()
1101 ret void
1102 }
1103
1104
1105 declare i32 @llvm.amdgcn.workitem.id.x()
1106 declare float @llvm.fabs.f32(float)
1107 declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1)
1108 declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1)
1109 declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)
1110 declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1)
1111 declare i1 @llvm.amdgcn.class.f32(float, i32)
1112 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
1113 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
1114 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
1115 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
1116 declare float @llvm.amdgcn.wwm.f32(float)
1117 declare i32 @llvm.amdgcn.wqm.i32(i32)
1118 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32)
1119 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32)
1120 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1)
1121 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
1122 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
1123 declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32)
1124 declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32)
1125 declare i32 @llvm.amdgcn.fcmp.i32.f32(float, float, i32)
1126 declare i32 @llvm.amdgcn.icmp.i32.i32(i32, i32, i32)
1127 declare void @llvm.amdgcn.kill(i1)
1128 declare i1 @llvm.amdgcn.wqm.vote(i1)
1129 declare i1 @llvm.amdgcn.ps.live()
1130 declare void @llvm.amdgcn.init.exec(i64)
1131 declare void @llvm.amdgcn.init.exec.from.input(i32, i32)
1132 declare i64 @llvm.cttz.i64(i64, i1)
1133 declare i32 @llvm.cttz.i32(i32, i1)
1134
1135 attributes #0 = { nounwind readnone speculatable }
1136 attributes #1 = { nounwind }
1137 attributes #2 = { nounwind readnone optnone noinline }
1138 attributes #3 = { "target-features"="+wavefrontsize32" }
1139 attributes #4 = { "target-features"="+wavefrontsize64" }
1515 ; GFX10-LABEL: xor3:
1616 ; GFX10: ; %bb.0:
1717 ; GFX10-NEXT: v_xor3_b32 v0, v0, v1, v2
18 ; GFX10-NEXT: ; implicit-def: $vcc_hi
1819 ; GFX10-NEXT: ; return to shader part epilog
1920 %x = xor i32 %a, %b
2021 %result = xor i32 %x, %c
3233 ; GFX10-LABEL: xor3_vgpr_b:
3334 ; GFX10: ; %bb.0:
3435 ; GFX10-NEXT: v_xor3_b32 v0, s2, v0, s3
36 ; GFX10-NEXT: ; implicit-def: $vcc_hi
3537 ; GFX10-NEXT: ; return to shader part epilog
3638 %x = xor i32 %a, %b
3739 %result = xor i32 %x, %c
4951 ; GFX10-LABEL: xor3_vgpr_all2:
5052 ; GFX10: ; %bb.0:
5153 ; GFX10-NEXT: v_xor3_b32 v0, v1, v2, v0
54 ; GFX10-NEXT: ; implicit-def: $vcc_hi
5255 ; GFX10-NEXT: ; return to shader part epilog
5356 %x = xor i32 %b, %c
5457 %result = xor i32 %a, %x
6669 ; GFX10-LABEL: xor3_vgpr_bc:
6770 ; GFX10: ; %bb.0:
6871 ; GFX10-NEXT: v_xor3_b32 v0, s2, v0, v1
72 ; GFX10-NEXT: ; implicit-def: $vcc_hi
6973 ; GFX10-NEXT: ; return to shader part epilog
7074 %x = xor i32 %a, %b
7175 %result = xor i32 %x, %c
8387 ; GFX10-LABEL: xor3_vgpr_const:
8488 ; GFX10: ; %bb.0:
8589 ; GFX10-NEXT: v_xor3_b32 v0, v0, v1, 16
90 ; GFX10-NEXT: ; implicit-def: $vcc_hi
8691 ; GFX10-NEXT: ; return to shader part epilog
8792 %x = xor i32 %a, %b
8893 %result = xor i32 %x, 16
101106 ; GFX10-LABEL: xor3_multiuse_outer:
102107 ; GFX10: ; %bb.0:
103108 ; GFX10-NEXT: v_xor3_b32 v0, v0, v1, v2
109 ; GFX10-NEXT: ; implicit-def: $vcc_hi
104110 ; GFX10-NEXT: v_mul_lo_u32 v1, v0, v3
105111 ; GFX10-NEXT: ; return to shader part epilog
106112 %inner = xor i32 %a, %b
122128 ; GFX10-LABEL: xor3_multiuse_inner:
123129 ; GFX10: ; %bb.0:
124130 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
131 ; GFX10-NEXT: ; implicit-def: $vcc_hi
125132 ; GFX10-NEXT: v_xor_b32_e32 v1, v0, v2
126133 ; GFX10-NEXT: ; return to shader part epilog
127134 %inner = xor i32 %a, %b
150157 ; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0
151158 ; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0
152159 ; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4
160 ; GFX10-NEXT: ; implicit-def: $vcc_hi
153161 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
154162 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
155163 ; GFX10-NEXT: ; return to shader part epilog
2121 ; GFX10-LABEL: xor_add:
2222 ; GFX10: ; %bb.0:
2323 ; GFX10-NEXT: v_xad_u32 v0, v0, v1, v2
24 ; GFX10-NEXT: ; implicit-def: $vcc_hi
2425 ; GFX10-NEXT: ; return to shader part epilog
2526 %x = xor i32 %a, %b
2627 %result = add i32 %x, %c
4546 ; GFX10-LABEL: xor_add_vgpr_a:
4647 ; GFX10: ; %bb.0:
4748 ; GFX10-NEXT: v_xad_u32 v0, v0, s2, s3
49 ; GFX10-NEXT: ; implicit-def: $vcc_hi
4850 ; GFX10-NEXT: ; return to shader part epilog
4951 %x = xor i32 %a, %b
5052 %result = add i32 %x, %c
6769 ; GFX10-LABEL: xor_add_vgpr_all:
6870 ; GFX10: ; %bb.0:
6971 ; GFX10-NEXT: v_xad_u32 v0, v0, v1, v2
72 ; GFX10-NEXT: ; implicit-def: $vcc_hi
7073 ; GFX10-NEXT: ; return to shader part epilog
7174 %x = xor i32 %a, %b
7275 %result = add i32 %x, %c
8992 ; GFX10-LABEL: xor_add_vgpr_ab:
9093 ; GFX10: ; %bb.0:
9194 ; GFX10-NEXT: v_xad_u32 v0, v0, v1, s2
95 ; GFX10-NEXT: ; implicit-def: $vcc_hi
9296 ; GFX10-NEXT: ; return to shader part epilog
9397 %x = xor i32 %a, %b
9498 %result = add i32 %x, %c
111115 ; GFX10-LABEL: xor_add_vgpr_const:
112116 ; GFX10: ; %bb.0:
113117 ; GFX10-NEXT: v_xad_u32 v0, v0, 3, v1
118 ; GFX10-NEXT: ; implicit-def: $vcc_hi
114119 ; GFX10-NEXT: ; return to shader part epilog
115120 %x = xor i32 %a, 3
116121 %result = add i32 %x, %b
3232
3333 v_div_fmas_f64 v[5:6], v[1:2], s[2:3], 0x123456
3434 // GFX10-ERR: error: invalid operand (violates constant bus restrictions)
35
36 //-----------------------------------------------------------------------------------------
37 // v_mad_u64_u32 has operands of different sizes.
38 // When these operands are literals, they are counted as 2 scalar values even if literals are identical.
39
40 v_mad_u64_u32 v[5:6], s12, v1, 0x12345678, 0x12345678
41 // GFX10: v_mad_u64_u32 v[5:6], s12, v1, 0x12345678, 0x12345678 ; encoding: [0x05,0x0c,0x76,0xd5,0x01,0xff,0xfd,0x03,0x78,0x56,0x34,0x12]
42
43 v_mad_u64_u32 v[5:6], s12, s1, 0x12345678, 0x12345678
44 // GFX10-ERR: error: invalid operand (violates constant bus restrictions)
0 // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX1032 %s
1 // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX1064 %s
2 // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX1032-ERR %s
3 // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX1064-ERR %s
4
5 v_cmp_ge_i32_e32 s0, v0
6 // GFX1032: v_cmp_ge_i32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x0c,0x7d]
7 // GFX1064: v_cmp_ge_i32_e32 vcc, s0, v0 ; encoding: [0x00,0x00,0x0c,0x7d]
8
9 v_cmp_ge_i32_e32 vcc_lo, s0, v1
10 // GFX1032: v_cmp_ge_i32_e32 vcc_lo, s0, v1 ; encoding: [0x00,0x02,0x0c,0x7d]
11 // GFX1064-ERR: error: instruction not supported on this GPU
12
13 v_cmp_ge_i32_e32 vcc, s0, v2
14 // GFX1032-ERR: error: instruction not supported on this GPU
15 // GFX1064: v_cmp_ge_i32_e32 vcc, s0, v2 ; encoding: [0x00,0x04,0x0c,0x7d]
16
17 v_cmp_le_f16_sdwa s0, v3, v4 src0_sel:WORD_1 src1_sel:DWORD
18 // GFX1032: v_cmp_le_f16_sdwa s0, v3, v4 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x08,0x96,0x7d,0x03,0x80,0x05,0x06]
19 // GFX1064-ERR: error: invalid operand for instruction
20
21 v_cmp_le_f16_sdwa s[0:1], v3, v4 src0_sel:WORD_1 src1_sel:DWORD
22 // GFX1032-ERR: error: invalid operand for instruction
23 // GFX1064: v_cmp_le_f16_sdwa s[0:1], v3, v4 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x08,0x96,0x7d,0x03,0x80,0x05,0x06]
24
25 v_cmp_class_f32_e32 vcc_lo, s0, v0
26 // GFX1032: v_cmp_class_f32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x10,0x7d]
27 // GFX1064-ERR: error: instruction not supported on this GPU
28
29 v_cmp_class_f32_e32 vcc, s0, v0
30 // GFX1032-ERR: error: instruction not supported on this GPU
31 // GFX1064: v_cmp_class_f32_e32 vcc, s0, v0 ; encoding: [0x00,0x00,0x10,0x7d]
32
33 // TODO-GFX10: The following encoding does not match SP3's encoding, which is:
34 // [0xf9,0x04,0x1e,0x7d,0x01,0x06,0x06,0x06]
35 v_cmp_class_f16_sdwa vcc_lo, v1, v2 src0_sel:DWORD src1_sel:DWORD
36 // GFX1032: v_cmp_class_f16_sdwa vcc_lo, v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0x00,0x06,0x06]
37 // GFX1064-ERR: error: invalid operand for instruction
38
39 // TODO-GFX10: The following encoding does not match SP3's encoding, which is:
40 // [0xf9,0x04,0x1e,0x7d,0x01,0x06,0x06,0x06]
41 v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
42 // GFX1032-ERR: error: instruction not supported on this GPU
43 // GFX1064: v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0x00,0x06,0x06]
44
45 v_cmp_class_f16_sdwa s0, v1, v2 src0_sel:DWORD src1_sel:DWORD
46 // GFX1032: v_cmp_class_f16_sdwa s0, v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0x80,0x06,0x06]
47 // GFX1064-ERR: error: invalid operand for instruction
48
49 v_cmp_class_f16_sdwa s[0:1], v1, v2 src0_sel:DWORD src1_sel:DWORD
50 // GFX1032-ERR: error: invalid operand for instruction
51 // GFX1064: v_cmp_class_f16_sdwa s[0:1], v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0x80,0x06,0x06]
52
53 v_cndmask_b32_e32 v1, v2, v3,
54 // GFX1032: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; encoding: [0x02,0x07,0x02,0x02]
55 // GFX1064: v_cndmask_b32_e32 v1, v2, v3, vcc ; encoding: [0x02,0x07,0x02,0x02]
56
57 v_cndmask_b32_e32 v1, v2, v3, vcc_lo
58 // GFX1032: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; encoding: [0x02,0x07,0x02,0x02]
59 // GFX1064-ERR: error: instruction not supported on this GPU
60
61 v_cndmask_b32_e32 v1, v2, v3, vcc
62 // GFX1032-ERR: error: instruction not supported on this GPU
63 // GFX1064: v_cndmask_b32_e32 v1, v2, v3, vcc ; encoding: [0x02,0x07,0x02,0x02]
64
65 v_add_co_u32_e32 v2, vcc_lo, s0, v2
66 // GFX1032-ERR: error: instruction not supported on this GPU
67 // GFX1064-ERR: error: instruction not supported on this GPU
68
69 v_add_co_u32_e32 v2, vcc, s0, v2
70 // GFX1032-ERR: error: instruction not supported on this GPU
71 // GFX1064-ERR: error: instruction not supported on this GPU
72
73 v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
74 // GFX1032: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; encoding: [0x03,0x09,0x06,0x50]
75 // GFX1064-ERR: error: instruction not supported on this GPU
76
77 v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
78 // GFX1032-ERR: error: instruction not supported on this GPU
79 // GFX1064: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc ; encoding: [0x03,0x09,0x06,0x50]
80
81 v_add_co_ci_u32_e32 v3, v3, v4
82 // GFX1032: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; encoding: [0x03,0x09,0x06,0x50]
83 // GFX1064: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc ; encoding: [0x03,0x09,0x06,0x50]
84
85 v_sub_co_u32_e32 v2, vcc_lo, s0, v2
86 // GFX1032-ERR: error: instruction not supported on this GPU
87 // GFX1064-ERR: error: instruction not supported on this GPU
88
89 v_sub_co_u32_e32 v2, vcc, s0, v2
90 // GFX1032-ERR: error: instruction not supported on this GPU
91 // GFX1064-ERR: error: instruction not supported on this GPU
92
93 v_subrev_co_u32_e32 v2, vcc_lo, s0, v2
94 // GFX1032-ERR: error: instruction not supported on this GPU
95 // GFX1064-ERR: error: instruction not supported on this GPU
96
97 v_subrev_co_u32_e32 v2, vcc, s0, v2
98 // GFX1032-ERR: error: instruction not supported on this GPU
99 // GFX1064-ERR: error: instruction not supported on this GPU
100
101 v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
102 // GFX1032: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; encoding: [0x03,0x09,0x06,0x52]
103 // GFX1064-ERR: error: instruction not supported on this GPU
104
105 v_sub_co_ci_u32_e32 v3, vcc, v3, v4, vcc
106 // GFX1032-ERR: error: instruction not supported on this GPU
107 // GFX1064: v_sub_co_ci_u32_e32 v3, vcc, v3, v4, vcc ; encoding: [0x03,0x09,0x06,0x52]
108
109 v_sub_co_ci_u32_e32 v3, v3, v4
110 // GFX1032: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; encoding: [0x03,0x09,0x06,0x52]
111 // GFX1064: v_sub_co_ci_u32_e32 v3, vcc, v3, v4, vcc ; encoding: [0x03,0x09,0x06,0x52]
112
113 v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
114 // GFX1032: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; encoding: [0x80,0x02,0x02,0x54]
115 // GFX1064-ERR: error: instruction not supported on this GPU
116
117 v_subrev_co_ci_u32_e32 v1, vcc, 0, v1, vcc
118 // GFX1032-ERR: error: instruction not supported on this GPU
119 // GFX1064: v_subrev_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; encoding: [0x80,0x02,0x02,0x54]
120
121 v_subrev_co_ci_u32_e32 v1, 0, v1
122 // GFX1032: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; encoding: [0x80,0x02,0x02,0x54]
123 // GFX1064: v_subrev_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; encoding: [0x80,0x02,0x02,0x54]
124
125 v_add_co_u32_sdwa v0, vcc_lo, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
126 // GFX1032-ERR: error: invalid operand
127 // GFX1064-ERR: error: invalid operand
128
129 v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
130 // GFX1032-ERR: error: instruction not supported
131 // GFX1064-ERR: error: instruction not supported
132
133 v_add_co_u32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
134 // GFX1032-ERR: error: not a valid operand
135 // GFX1064-ERR: error: not a valid operand
136
137 v_add_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
138 // GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x00,0x06]
139 // GFX1064-ERR: error: instruction not supported on this GPU
140
141 v_add_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
142 // GFX1032-ERR: error: instruction not supported on this GPU
143 // GFX1064: v_add_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x00,0x06]
144
145 v_add_co_ci_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
146 // GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x00,0x06]
147 // GFX1064: v_add_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x00,0x06]
148
149 v_sub_co_u32_sdwa v0, vcc_lo, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
150 // GFX1032-ERR: error: invalid operand
151 // GFX1064-ERR: error: invalid operand
152
153 v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
154 // GFX1032-ERR: error: instruction not supported
155 // GFX1064-ERR: error: instruction not supported
156
157 v_sub_co_u32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
158 // GFX1032-ERR: error: not a valid operand
159 // GFX1064-ERR: error: not a valid operand
160
161 v_subrev_co_u32_sdwa v0, vcc_lo, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
162 // GFX1032-ERR: error: invalid operand
163 // GFX1064-ERR: error: invalid operand
164
165 v_subrev_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
166 // GFX1032-ERR: error: instruction not supported
167 // GFX1064-ERR: error: instruction not supported
168
169 v_subrev_co_u32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
170 // GFX1032-ERR: error: not a valid operand
171 // GFX1064-ERR: error: not a valid operand
172
173 v_sub_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
174 // GFX1032: v_sub_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x52,0x01,0x06,0x00,0x06]
175 // GFX1064-ERR: error: instruction not supported on this GPU
176
177 v_sub_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
178 // GFX1032-ERR: error: instruction not supported on this GPU
179 // GFX1064: v_sub_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x52,0x01,0x06,0x00,0x06]
180
181 v_sub_co_ci_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
182 // GFX1032: v_sub_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x52,0x01,0x06,0x00,0x06]
183 // GFX1064: v_sub_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x52,0x01,0x06,0x00,0x06]
184
185 v_subrev_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
186 // GFX1032: v_subrev_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x54,0x01,0x06,0x00,0x06]
187 // GFX1064-ERR: error: instruction not supported on this GPU
188
189 v_subrev_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
190 // GFX1032-ERR: error: instruction not supported on this GPU
191 // GFX1064: v_subrev_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x54,0x01,0x06,0x00,0x06]
192
193 v_subrev_co_ci_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
194 // GFX1032: v_subrev_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x54,0x01,0x06,0x00,0x06]
195 // GFX1064: v_subrev_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x54,0x01,0x06,0x00,0x06]
196
197 v_add_co_ci_u32 v1, sext(v1), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
198 // GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, sext(v1), sext(v4), vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x08,0x0e]
199 // GFX1064: v_add_co_ci_u32_sdwa v1, vcc, sext(v1), sext(v4), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x08,0x0e]
200
201 v_add_co_ci_u32_sdwa v1, vcc_lo, sext(v1), sext(v4), vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
202 // GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, sext(v1), sext(v4), vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x08,0x0e]
203 // GFX1064-ERR: error: instruction not supported on this GPU
204
205 v_add_co_ci_u32_sdwa v1, vcc, sext(v1), sext(v4), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
206 // GFX1032-ERR: error: instruction not supported on this GPU
207 // GFX1064: v_add_co_ci_u32_sdwa v1, vcc, sext(v1), sext(v4), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x08,0x02,0x50,0x01,0x06,0x08,0x0e]
208
209 v_add_co_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
210 // GFX1032-ERR: error: not a valid operand
211 // GFX1064-ERR: error: not a valid operand
212
213 v_add_co_u32_dpp v5, vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
214 // GFX1032-ERR: error: not a valid operand
215 // GFX1064-ERR: error: not a valid operand
216
217 v_add_co_u32_dpp v5, vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
218 // GFX1032-ERR: error: not a valid operand
219 // GFX1064-ERR: error: not a valid operand
220
221 v_add_co_ci_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
222 // GFX1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00]
223 // GFX1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00]
224
225 v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
226 // GFX1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00]
227 // GFX1064-ERR: error: instruction not supported on this GPU
228
229 v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
230 // GFX1032-ERR: error: instruction not supported on this GPU
231 // GFX1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00]
232
233 v_sub_co_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
234 // GFX1032-ERR: error: not a valid operand
235 // GFX1064-ERR: error: not a valid operand
236
237 v_sub_co_u32_dpp v5, vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
238 // GFX1032-ERR: error: not a valid operand
239 // GFX1064-ERR: error: not a valid operand
240
241 v_sub_co_u32_dpp v5, vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
242 // GFX1032-ERR: error: not a valid operand
243 // GFX1064-ERR: error: not a valid operand
244
245 v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
246 // GFX1032: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x00]
247 // GFX1064-ERR: error: instruction not supported on this GPU
248
249 v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
250 // GFX1032-ERR: error: instruction not supported on this GPU
251 // GFX1064: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x00]
252
253 v_subrev_co_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
254 // GFX1032-ERR: error: not a valid operand
255 // GFX1064-ERR: error: not a valid operand
256
257 v_subrev_co_u32_dpp v5, vcc_lo, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
258 // GFX1032-ERR: error: not a valid operand
259 // GFX1064-ERR: error: not a valid operand
260
261 v_subrev_co_u32_dpp v5, vcc, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
262 // GFX1032-ERR: error: not a valid operand
263 // GFX1064-ERR: error: not a valid operand
264
265 v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
266 // GFX1032: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00]
267 // GFX1064-ERR: error: instruction not supported on this GPU
268
269 v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
270 // GFX1032-ERR: error: instruction not supported on this GPU
271 // GFX1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00]
272
273 v_add_co_u32 v0, s0, v0, v2
274 // GFX1032: v_add_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
275 // GFX1064-ERR: error: invalid operand for instruction
276
277 v_add_co_u32_e64 v0, s0, v0, v2
278 // GFX1032: v_add_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
279 // GFX1064-ERR: error: invalid operand for instruction
280
281 v_add_co_ci_u32_e64 v4, s0, v1, v5, s2
282 // GFX1032: v_add_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00]
283 // GFX1064-ERR: error: invalid operand for instruction
284
285 v_sub_co_u32 v0, s0, v0, v2
286 // GFX1032: v_sub_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
287 // GFX1064-ERR: error: invalid operand for instruction
288
289 v_sub_co_u32_e64 v0, s0, v0, v2
290 // GFX1032: v_sub_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
291 // GFX1064-ERR: error: invalid operand for instruction
292
293 v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2
294 // GFX1032: v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00]
295 // GFX1064-ERR: error: invalid operand for instruction
296
297 v_subrev_co_u32 v0, s0, v0, v2
298 // GFX1032: v_subrev_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
299 // GFX1064-ERR: error: invalid operand for instruction
300
301 v_subrev_co_u32_e64 v0, s0, v0, v2
302 // GFX1032: v_subrev_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
303 // GFX1064-ERR: error: invalid operand for instruction
304
305 v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2
306 // GFX1032: v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x2a,0xd5,0x01,0x0b,0x0a,0x00]
307 // GFX1064-ERR: error: invalid operand for instruction
308
309 v_add_co_u32 v0, s[0:1], v0, v2
310 // GFX1032-ERR: error: invalid operand for instruction
311 // GFX1064: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
312
313 v_add_co_u32_e64 v0, s[0:1], v0, v2
314 // GFX1032-ERR: error: invalid operand for instruction
315 // GFX1064: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
316
317 v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
318 // GFX1032-ERR: error: invalid operand for instruction
319 // GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00]
320
321 v_sub_co_u32 v0, s[0:1], v0, v2
322 // GFX1032-ERR: error: invalid operand for instruction
323 // GFX1064: v_sub_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
324
325 v_sub_co_u32_e64 v0, s[0:1], v0, v2
326 // GFX1032-ERR: error: invalid operand for instruction
327 // GFX1064: v_sub_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
328
329 v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
330 // GFX1032-ERR: error: invalid operand for instruction
331 // GFX1064: v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00]
332
333 v_subrev_co_u32 v0, s[0:1], v0, v2
334 // GFX1032-ERR: error: invalid operand for instruction
335 // GFX1064: v_subrev_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
336
337 v_subrev_co_u32_e64 v0, s[0:1], v0, v2
338 // GFX1032-ERR: error: invalid operand for instruction
339 // GFX1064: v_subrev_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
340
341 v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
342 // GFX1032-ERR: error: invalid operand for instruction
343 // GFX1064: v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x2a,0xd5,0x01,0x0b,0x0a,0x00]
344
345 v_add_co_ci_u32_e64 v4, vcc_lo, v1, v5, s2
346 // GFX1032: v_add_co_ci_u32_e64 v4, vcc_lo, v1, v5, s2 ; encoding: [0x04,0x6a,0x28,0xd5,0x01,0x0b,0x0a,0x00]
347 // GFX1064-ERR: error: invalid operand for instruction
348
349 v_add_co_ci_u32_e64 v4, vcc, v1, v5, s[2:3]
350 // GFX1032-ERR: error: invalid operand for instruction
351 // GFX1064: v_add_co_ci_u32_e64 v4, vcc, v1, v5, s[2:3] ; encoding: [0x04,0x6a,0x28,0xd5,0x01,0x0b,0x0a,0x00]
352
353 v_add_co_ci_u32_e64 v4, s0, v1, v5, vcc_lo
354 // GFX1032: v_add_co_ci_u32_e64 v4, s0, v1, v5, vcc_lo ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0xaa,0x01]
355 // GFX1064-ERR: error: invalid operand for instruction
356
357 v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc
358 // GFX1032-ERR: error: invalid operand for instruction
359 // GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0xaa,0x01]
360
361 v_div_scale_f32 v2, s2, v0, v0, v2
362 // GFX1032: v_div_scale_f32 v2, s2, v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04]
363 // GFX1064-ERR: error: invalid operand for instruction
364
365 v_div_scale_f32 v2, s[2:3], v0, v0, v2
366 // GFX1032-ERR: error: invalid operand for instruction
367 // GFX1064: v_div_scale_f32 v2, s[2:3], v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04]
368
369 v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3]
370 // GFX1032: v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04]
371 // GFX1064-ERR: error: invalid operand for instruction
372
373 v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], v[2:3]
374 // GFX1032-ERR: error: invalid operand for instruction
375 // GFX1064: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04]
376
377 v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3]
378 // GFX1032: v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3] ; encoding: [0x00,0x06,0x77,0xd5,0x00,0x03,0x0a,0x04]
379 // GFX1064-ERR: error: invalid operand for instruction
380
381 v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
382 // GFX1032-ERR: error: invalid operand for instruction
383 // GFX1064: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3] ; encoding: [0x00,0x06,0x77,0xd5,0x00,0x03,0x0a,0x04]
384
385 v_mad_u64_u32 v[0:1], s6, v0, v1, v[2:3]
386 // GFX1032: v_mad_u64_u32 v[0:1], s6, v0, v1, v[2:3] ; encoding: [0x00,0x06,0x76,0xd5,0x00,0x03,0x0a,0x04]
387 // GFX1064-ERR: error: invalid operand for instruction
388
389 v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
390 // GFX1032-ERR: error: invalid operand for instruction
391 // GFX1064: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3] ; encoding: [0x00,0x06,0x76,0xd5,0x00,0x03,0x0a,0x04]
392
393 v_cmpx_neq_f32_e32 v0, v1
394 // GFX1032: v_cmpx_neq_f32_e32 v0, v1 ; encoding: [0x00,0x03,0x3a,0x7c]
395 // GFX1064: v_cmpx_neq_f32_e32 v0, v1 ; encoding: [0x00,0x03,0x3a,0x7c]
396
397 v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD
398 // GFX1032: v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x3a,0x7c,0x00,0x00,0x05,0x06]
399 // GFX1064: v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x3a,0x7c,0x00,0x00,0x05,0x06]
400
401 v_cmpx_eq_u32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD
402 // GFX1032: v_cmpx_eq_u32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0xa5,0x7d,0x00,0x00,0x05,0x86]
403 // GFX1064: v_cmpx_eq_u32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0xa5,0x7d,0x00,0x00,0x05,0x86]
404
405 v_cmpx_class_f32_e64 v0, 1
406 // GFX1032: v_cmpx_class_f32_e64 v0, 1 ; encoding: [0x00,0x00,0x98,0xd4,0x00,0x03,0x01,0x00]
407 // GFX1064: v_cmpx_class_f32_e64 v0, 1 ; encoding: [0x00,0x00,0x98,0xd4,0x00,0x03,0x01,0x00]
408
409 v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD
410 // GFX1032: v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x31,0x7d,0x00,0x00,0x05,0x86]
411 // GFX1064: v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x31,0x7d,0x00,0x00,0x05,0x86]
77
88 # GFX10: s_mov_b32 s105, s104 ; encoding: [0x68,0x03,0xe9,0xbe]
99 0x68,0x03,0xe9,0xbe
10
11 # GFX10: v_cmp_eq_f32_e64 s105, v0, s105
12 0x69,0x00,0x02,0xd4,0x00,0xd3,0x00,0x00
13
14 # GFX10: v_cmp_eq_f32_sdwa s105, v0, s105 src0_sel:DWORD src1_sel:DWORD
15 0xf9,0xd2,0x04,0x7c,0x00,0xe9,0x06,0x86
0 # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1032 %s
1 # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64,-wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX1064 %s
2
3 # GFX1032: v_cmp_lt_f32_e32 vcc_lo, s2, v4
4 # GFX1064: v_cmp_lt_f32_e32 vcc, s2, v4
5 0x02,0x08,0x02,0x7c
6
7 # GFX1032: v_cmp_ge_i32_e64 s2, s0, v2
8 # GFX1064: v_cmp_ge_i32_e64 s[2:3], s0, v2
9 0x02,0x00,0x86,0xd4,0x00,0x04,0x02,0x00
10
11 # GFX1032: v_cmp_ge_i32_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
12 # GFX1064: v_cmp_ge_i32_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
13 0xf9,0x04,0x0c,0x7d,0x00,0x00,0x05,0x06
14
15 # GFX1032: v_cmp_le_f16_sdwa s0, v3, v4 src0_sel:WORD_1 src1_sel:DWORD
16 # GFX1064: v_cmp_le_f16_sdwa s[0:1], v3, v4 src0_sel:WORD_1 src1_sel:DWORD
17 0xf9,0x08,0x96,0x7d,0x03,0x80,0x05,0x06
18
19 # GFX1032: v_cmp_class_f32_e32 vcc_lo, s0, v0
20 # GFX1064: v_cmp_class_f32_e32 vcc, s0, v0
21 0x00,0x00,0x10,0x7d
22
23 # GFX1032: v_cmp_class_f16_sdwa vcc_lo, v1, v2 src0_sel:DWORD src1_sel:DWORD
24 # GFX1064: v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
25 0xf9,0x04,0x1e,0x7d,0x01,0x00,0x06,0x06
26
27 # GFX1032: v_cmp_class_f16_sdwa s0, v1, v2 src0_sel:DWORD src1_sel:DWORD
28 # GFX1064: v_cmp_class_f16_sdwa s[0:1], v1, v2 src0_sel:DWORD src1_sel:DWORD
29 0xf9,0x04,0x1e,0x7d,0x01,0x80,0x06,0x06
30
31 # GFX1032: v_cndmask_b32_e32 v5, 0, v2, vcc_lo
32 # GFX1064: v_cndmask_b32_e32 v5, 0, v2, vcc ;
33 0x80,0x04,0x0a,0x02
34
35 # GFX1032: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
36 # GFX1064: v_cndmask_b32_e32 v1, v2, v3, vcc ;
37 0x02,0x07,0x02,0x02
38
39 # GFX1032: v_add_co_u32_e64 v2, vcc_lo, s0, v2
40 # GFX1064: v_add_co_u32_e64 v2, vcc, s0, v2
41 0x02,0x6a,0x0f,0xd7,0x00,0x04,0x02,0x00
42
43 # GFX1032: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
44 # GFX1064: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc ;
45 0x03,0x09,0x06,0x50
46
47 # GFX1032: v_sub_co_u32_e64 v2, vcc_lo, s0, v2
48 # GFX1064: v_sub_co_u32_e64 v2, vcc, s0, v2
49 0x02,0x6a,0x10,0xd7,0x00,0x04,0x02,0x00
50
51 # GFX1032: v_subrev_co_u32_e64 v2, vcc_lo, s0, v2
52 # GFX1064: v_subrev_co_u32_e64 v2, vcc, s0, v2
53 0x02,0x6a,0x19,0xd7,0x00,0x04,0x02,0x00
54
55 # GFX1032: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
56 # GFX1064: v_sub_co_ci_u32_e32 v3, vcc, v3, v4, vcc ;
57 0x03,0x09,0x06,0x52
58
59 # GFX1032: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
60 # GFX1064: v_subrev_co_ci_u32_e32 v1, vcc, 0, v1, vcc ;
61 0x80,0x02,0x02,0x54
62
63 # GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
64 # GFX1064: v_add_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
65 0xf9,0x08,0x02,0x50,0x01,0x06,0x00,0x06
66
67 # GFX1032: v_sub_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
68 # GFX1064: v_sub_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
69 0xf9,0x08,0x02,0x52,0x01,0x06,0x00,0x06
70
71 # GFX1032: v_subrev_co_ci_u32_sdwa v1, vcc_lo, v1, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
72 # GFX1064: v_subrev_co_ci_u32_sdwa v1, vcc, v1, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
73 0xf9,0x08,0x02,0x54,0x01,0x06,0x00,0x06
74
75 # GFX1032: v_add_co_ci_u32_sdwa v1, vcc_lo, sext(v1), sext(v4), vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
76 # GFX1064: v_add_co_ci_u32_sdwa v1, vcc, sext(v1), sext(v4), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
77 0xf9,0x08,0x02,0x50,0x01,0x06,0x08,0x0e
78
79 # GFX1032: v_add_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
80 # GFX1064: v_add_nc_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
81 0xfa,0x04,0x0a,0x4a,0x01,0xe4,0x00,0x00
82
83 # FIXME: Results in invalid v_subrev_u16_dpp which apparently has the same encoding but does not exist in GFX10
84
85 # gfx1032: v_add_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
86 # gfx1064: v_add_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
87 # 0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00
88
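89 # FIXME: Disassembles as v_mul_lo_u16_dpp instead of v_sub_co_ci_u32_dpp, so the check below is disabled (lower-case prefix, encoding commented out).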
90
91 # gfx1032: v_sub_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
92 # gfx1064: v_sub_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
93 # 0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x00
94
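95 # FIXME: Disassembles as v_lshlrev_b16_dpp instead of v_subrev_co_ci_u32_dpp, so the check below is disabled (lower-case prefix, encoding commented out).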
96
97 # gfx1032: v_subrev_co_ci_u32_dpp v5, vcc_lo, v1, v2, vcc_lo quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
98 # gfx1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
99 # 0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00
100
101 # GFX1032: v_add_co_u32_e64 v0, s0, v0, v2
102 # GFX1064: v_add_co_u32_e64 v0, s[0:1], v0, v2
103 0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00
104
105 # GFX1032: v_add_co_ci_u32_e64 v4, s0, v1, v5, s2
106 # GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
107 0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00
108
109 # GFX1032: v_sub_co_u32_e64 v0, s0, v0, v2
110 # GFX1064: v_sub_co_u32_e64 v0, s[0:1], v0, v2
111 0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00
112
113 # GFX1032: v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2
114 # GFX1064: v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
115 0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00
116
117 # GFX1032: v_subrev_co_u32_e64 v0, s0, v0, v2
118 # GFX1064: v_subrev_co_u32_e64 v0, s[0:1], v0, v2
119 0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00
120
121 # GFX1032: v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2
122 # GFX1064: v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
123 0x04,0x00,0x2a,0xd5,0x01,0x0b,0x0a,0x00
124
125 # GFX1032: v_add_co_ci_u32_e64 v4, vcc_lo, v1, v5, s2
126 # GFX1064: v_add_co_ci_u32_e64 v4, vcc, v1, v5, s[2:3]
127 0x04,0x6a,0x28,0xd5,0x01,0x0b,0x0a,0x00
128
129 # GFX1032: v_add_co_ci_u32_e64 v4, s0, v1, v5, vcc_lo
130 # GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc ;
131 0x04,0x00,0x28,0xd5,0x01,0x0b,0xaa,0x01
132
133 # GFX1032: v_div_scale_f32 v2, s2, v0, v0, v2
134 # GFX1064: v_div_scale_f32 v2, s[2:3], v0, v0, v2
135 0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04
136
137 # GFX1032: v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3]
138 # GFX1064: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], v[2:3]
139 0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04
140
141 # GFX1032: v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3]
142 # GFX1064: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
143 0x00,0x06,0x77,0xd5,0x00,0x03,0x0a,0x04
144
145 # GFX1032: v_mad_u64_u32 v[0:1], s6, v0, v1, v[2:3]
146 # GFX1064: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
147 0x00,0x06,0x76,0xd5,0x00,0x03,0x0a,0x04
148
149 # GFX1032: v_cmpx_neq_f32_e32 v0, v1
150 # GFX1064: v_cmpx_neq_f32_e32 v0, v1
151 0x00,0x03,0x3a,0x7c
152
153 # GFX1032: v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD
154 # GFX1064: v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD
155 0xf9,0x02,0x3a,0x7c,0x00,0x00,0x05,0x06
156
157 # GFX1032: v_cmpx_class_f32_e64 v0, 1
158 # GFX1064: v_cmpx_class_f32_e64 v0, 1
159 0x00,0x00,0x98,0xd4,0x00,0x03,0x01,0x00
160
161 # GFX1032: v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD
162 # GFX1064: v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD
163 0xf9,0x02,0x31,0x7d,0x00,0x00,0x05,0x86