llvm.org GIT mirror llvm / 75c4f68
AMDGPU: Try a lot harder to emit scalar loads

This has two main components. First, widen short constant loads in the DAG when they have the correct alignment. This is already done a bit in AMDGPUCodeGenPrepare, since that has access to DivergenceAnalysis, but that can't help kernarg loads created in the DAG. Start to use DAG divergence analysis to help this case.

The second part is to avoid kernel argument lowering breaking the alignment of short vector elements, because calling convention lowering wants to split everything into legal register types. When loading a split type, load the nearest 4-byte aligned segment and shift to get the desired bits. This extra load of the earlier argument piece ends up merging, and the bit extract hopefully folds out.

There are a number of improvements and regressions with this, but I think as-is this is a better compromise between several of the worst parts of SelectionDAG. Particularly when i16 is legal, this produces worse code for i8 and i16 element vector kernel arguments. This is partially due to the very weak load merging the DAG does: it only looks for fairly specific combines between pairs of loads, which no longer appear. In particular, this causes v4i16 loads to be split into 2 components when previously the two halves were merged. Worse, because of the newly introduced shifts, there is a lot more unnecessary vector packing and unpacking code emitted. At least some of this is due to reporting false for isTypeDesirableForOp for i16 as a workaround for the lack of divergence information in the DAG. In the cases where this happens it doesn't actually matter, but the relevant code in SimplifyDemandedBits doesn't have the context to know to ignore this.

The use of the scalar cache is probably more important than the mess of mostly scalar instructions doing this packing and unpacking. Future work can fix this, possibly by making better use of the new DAG divergence information for controlling promotion decisions, or by adding another version of shift + trunc + shift combines that doesn't only know about the used types.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@334180 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault 1 year, 8 months ago
29 changed file(s) with 926 addition(s) and 539 deletion(s).
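(Editor's worked example, not part of the commit text: with this change, a `half` kernel argument that lands at a 2-byte offset inside a kernarg dword — e.g. the second of two consecutive `half` arguments — is no longer fetched with a sub-dword extending load. Instead the lowering emits a dword-aligned `s_load_dword` of the enclosing 4 bytes and an `s_lshr_b32` by `8 * (offset % 4)` bits to extract the argument, which is what the updated FileCheck patterns below look for.)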
4747
4848 namespace {
4949
50 static cl::opt<bool> WidenLoads(
51 "amdgpu-codegenprepare-widen-constant-loads",
52 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
53 cl::ReallyHidden,
54 cl::init(true));
55
5056 class AMDGPUCodeGenPrepare : public FunctionPass,
5157 public InstVisitor<AMDGPUCodeGenPrepare, bool> {
5258 const SISubtarget *ST = nullptr;
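(Usage note, mine: `cl::init(true)` means the IR-level widening is on by default; it can be disabled for experiments with something like `llc -march=amdgcn -amdgpu-codegenprepare-widen-constant-loads=0`, which makes visitLoadInst below bail out immediately.)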
471477 }
472478
473479 bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
480 if (!WidenLoads)
481 return false;
482
474483 if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
475484 I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
476485 canWidenScalarExtLoad(I)) {
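(Context, mine: when the flag is on and canWidenScalarExtLoad approves, this visitor rewrites a uniform sub-dword constant-address-space load into a 32-bit load plus truncate — the IR-level counterpart of the DAG-level widenLoad added further down in this patch.)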
10801080 PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
10811081 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
10821082
1083
1084 // Try to avoid using an extload by loading earlier than the argument address,
1085 // and extracting the relevant bits. The load should hopefully be merged with
1086 // the previous argument.
1087 if (Align < 4) {
1089 assert(MemVT.getStoreSize() < 4);
1090 int64_t AlignDownOffset = alignDown(Offset, 4);
1091 int64_t OffsetDiff = Offset - AlignDownOffset;
1092
1093 EVT IntVT = MemVT.changeTypeToInteger();
1094
1095 // TODO: If we passed in the base kernel offset we could have a better
1096 // alignment than 4, but we don't really need it.
1097 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1098 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1099 MachineMemOperand::MODereferenceable |
1100 MachineMemOperand::MOInvariant);
1101
1102 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1103 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1104
1105 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1106 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1107 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1108
1109
1110 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1111 }
1112
10831113 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
10841114 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
10851115 MachineMemOperand::MODereferenceable |
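To make the alignDown/shift arithmetic above concrete, here is a minimal self-contained C++ sketch — my illustration, with alignDown reimplemented locally rather than taken from llvm/Support/MathExtras.h — for a hypothetical 2-byte argument at byte offset 38 of the kernarg segment:

#include <cassert>
#include <cstdint>

// Local stand-in for llvm::alignDown with a power-of-two alignment.
static int64_t alignDown(int64_t Value, int64_t Align) {
  return Value & ~(Align - 1);
}

int main() {
  const int64_t Offset = 38;                             // hypothetical i16 kernarg offset
  const int64_t AlignDownOffset = alignDown(Offset, 4);  // 36: dword-aligned load address
  const int64_t OffsetDiff = Offset - AlignDownOffset;   // 2 bytes into the loaded dword
  const int64_t ShiftAmt = OffsetDiff * 8;               // 16: SRL amount applied to the i32 load
  assert(AlignDownOffset == 36 && OffsetDiff == 2 && ShiftAmt == 16);
  return 0;
}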
52805310 return Op;
52815311 }
52825312 }
5313 }
5314
5315 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
5316 ISD::LoadExtType ExtType, SDValue Op,
5317 const SDLoc &SL, EVT VT) {
5318 if (VT.bitsLT(Op.getValueType()))
5319 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
5320
5321 switch (ExtType) {
5322 case ISD::SEXTLOAD:
5323 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
5324 case ISD::ZEXTLOAD:
5325 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
5326 case ISD::EXTLOAD:
5327 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
5328 case ISD::NON_EXTLOAD:
5329 return Op;
5330 }
5331
5332 llvm_unreachable("invalid ext type");
5333 }
5334
5335 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
5336 SelectionDAG &DAG = DCI.DAG;
5337 if (Ld->getAlignment() < 4 || Ld->isDivergent())
5338 return SDValue();
5339
5340 // FIXME: Constant loads should all be marked invariant.
5341 unsigned AS = Ld->getAddressSpace();
5342 if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
5343 AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
5344 (AS != AMDGPUASI.GLOBAL_ADDRESS || !Ld->isInvariant()))
5345 return SDValue();
5346
5347 // Don't do this early, since it may interfere with adjacent load merging for
5348 // illegal types. We can avoid losing alignment information for exotic types
5349 // pre-legalize.
5350 EVT MemVT = Ld->getMemoryVT();
5351 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
5352 MemVT.getSizeInBits() >= 32)
5353 return SDValue();
5354
5355 SDLoc SL(Ld);
5356
5357 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
5358 "unexpected vector extload");
5359
5360 // TODO: Drop only high part of range.
5361 SDValue Ptr = Ld->getBasePtr();
5362 SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
5363 MVT::i32, SL, Ld->getChain(), Ptr,
5364 Ld->getOffset(),
5365 Ld->getPointerInfo(), MVT::i32,
5366 Ld->getAlignment(),
5367 Ld->getMemOperand()->getFlags(),
5368 Ld->getAAInfo(),
5369 nullptr); // Drop ranges
5370
5371 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
5372 if (MemVT.isFloatingPoint()) {
5373 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
5374 "unexpected fp extload");
5375 TruncVT = MemVT.changeTypeToInteger();
5376 }
5377
5378 SDValue Cvt = NewLoad;
5379 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
5380 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
5381 DAG.getValueType(TruncVT));
5382 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
5383 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
5384 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
5385 } else {
5386 assert(Ld->getExtensionType() == ISD::EXTLOAD);
5387 }
5388
5389 EVT VT = Ld->getValueType(0);
5390 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
5391
5392 DCI.AddToWorklist(Cvt.getNode());
5393
5394 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
5395 // the appropriate extension from the 32-bit load.
5396 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
5397 DCI.AddToWorklist(Cvt.getNode());
5398
5399 // Handle conversion back to floating point if necessary.
5400 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
5401
5402 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
52835403 }
52845404
52855405 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
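As a scalar model of the in-register extension widenLoad emits on the widened 32-bit value — a sketch of mine, not the committed code — the sign/zero extend-in-reg cases behave like this:

#include <cassert>
#include <cstdint>

// Zero-extend-in-reg: keep only the low Bits of the widened dword
// (models DAG.getZeroExtendInReg for the ZEXTLOAD / NON_EXTLOAD cases).
static uint32_t zextInReg(uint32_t Word, unsigned Bits) {
  return Word & ((1u << Bits) - 1u);
}

// Sign-extend-in-reg: replicate bit Bits-1 through the high bits
// (models ISD::SIGN_EXTEND_INREG for the SEXTLOAD case).
static int32_t sextInReg(uint32_t Word, unsigned Bits) {
  const unsigned Shift = 32 - Bits;
  return (int32_t)(Word << Shift) >> Shift;
}

int main() {
  const uint32_t Dword = 0xDEADBEEF;                   // value of the widened i32 load
  assert(zextInReg(Dword, 8) == 0xEFu);                // i8 zextload result
  assert(sextInReg(Dword, 8) == (int32_t)0xFFFFFFEF);  // i8 sextload result
  assert(zextInReg(Dword, 16) == 0xBEEFu);             // i16 zextload result
  return 0;
}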
71827302 return performMinMaxCombine(N, DCI);
71837303 break;
71847304 }
7185 case ISD::LOAD:
7305 case ISD::LOAD: {
7306 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
7307 return Widened;
7308 LLVM_FALLTHROUGH;
7309 }
71867310 case ISD::STORE:
71877311 case ISD::ATOMIC_LOAD:
71887312 case ISD::ATOMIC_STORE:
4545 SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
4646 SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
4747 SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
48
49 SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const;
4850 SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
4951 SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
5052 SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
175175 ret void
176176 }
177177
178 ; FIXME: Should use SGPRs
179178 ; FUNC-LABEL: {{^}}s_and_i1:
180 ; SI: v_and_b32
179 ; SI: s_load_dword [[LOAD:s[0-9]+]]
180 ; SI: s_lshr_b32 [[B_SHIFT:s[0-9]+]], [[LOAD]], 8
181 ; SI: s_and_b32 [[AND:s[0-9]+]], [[LOAD]], [[B_SHIFT]]
182 ; SI: s_and_b32 [[AND_TRUNC:s[0-9]+]], [[AND]], 1{{$}}
183 ; SI: v_mov_b32_e32 [[V_AND_TRUNC:v[0-9]+]], [[AND_TRUNC]]
184 ; SI: buffer_store_byte [[V_AND_TRUNC]]
181185 define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
182186 %and = and i1 %a, %b
183187 store i1 %and, i1 addrspace(1)* %out
11 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
22 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
33
4 ; FIXME: Should be same on CI/VI
45 ; GCN-LABEL: {{^}}s_ashr_v2i16:
56 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
67 ; GFX9: s_load_dword [[RHS:s[0-9]+]]
78 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
89 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
910
10 ; VI: s_load_dword [[LHS:s[0-9]+]]
11 ; VI: s_load_dword [[RHS:s[0-9]+]]
11 ; CIVI: s_load_dword [[LHS:s[0-9]+]]
12 ; CIVI: s_load_dword [[RHS:s[0-9]+]]
13
1214 ; VI: s_ashr_i32
1315 ; VI: s_ashr_i32
1416 ; VI: s_sext_i32_i16
1921 ; VI: s_and_b32
2022 ; VI: s_or_b32
2123
22 ; CI-DAG: v_ashrrev_i32_e32
23 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
24 ; CI-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
25 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
26 ; CI: v_or_b32_e32
24 ; CI: s_ashr_i32
25 ; CI: s_and_b32
26 ; CI: s_lshr_b32
27 ; CI: s_sext_i32_i16
28 ; CI: s_ashr_i32
29 ; CI: s_ashr_i32
30 ; CI: s_lshl_b32
31 ; CI: s_and_b32
2732 define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
2833 %result = ashr <2 x i16> %lhs, %rhs
2934 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
0 ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
1 ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
0 ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
1 ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s
44
55 ; GCN-LABEL: {{^}}test_branch:
66 ; GCNNOOPT: v_writelane_b32
2727 }
2828
2929 ; GCN-LABEL: {{^}}test_brcc_i1:
30 ; GCN: buffer_load_ubyte
31 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1,
32 ; GCN: v_cmp_eq_u32_e32 vcc,
33 ; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]]
30 ; GCN: s_load_dword [[VAL:s[0-9]+]]
31 ; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]]
32 ; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1
33 ; GCN: s_cmp_eq_u32
34 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
3435
3536 ; GCN: buffer_store_dword
3637
104104 ; GCN: s_cbranch_vccnz [[LOOPBB]]
105105 ; GCN-NEXT: ; %bb.2
106106 ; GCN-NEXT: s_endpgm
107 define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
107 define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind {
108108 entry:
109 %cond = load volatile i1, i1 addrspace(3)* null
109110 br label %for.body
110111
111112 for.exit:
5757 }
5858
5959 ; GCN-LABEL: {{^}}extract_vector_elt_v3f16:
60 ; GCN: buffer_load_ushort
61 ; GCN: buffer_store_short
62 ; GCN: buffer_store_short
60 ; GCN: s_load_dword s
61 ; GCN: s_load_dword s
6362 define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
6463 %p0 = extractelement <3 x half> %foo, i32 0
6564 %p1 = extractelement <3 x half> %foo, i32 2
6968 ret void
7069 }
7170
71 ; FIXME: Why sometimes vector shift?
7272 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
73 ; SICIVI: buffer_load_ushort
74 ; SICIVI: buffer_load_ushort
75 ; SICIVI: buffer_load_ushort
73 ; GCN: s_load_dword s
74 ; GCN: s_load_dword s
75 ; GCN: s_load_dword s
7676
7777 ; GFX9-DAG: global_load_short_d16_hi v
7878 ; GFX9-DAG: global_load_short_d16 v
8080 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
8181 ; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
8282
83 ; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
83 ; SI: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
8484
8585 ; GCN: {{buffer|global}}_store_short
8686 define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
0 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,GFX89 %s
0 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,GFX89 %s
22 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
33
44 ; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
5757 }
5858
5959 ; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
60 ; GCN: buffer_load_ushort
60 ; GCN: s_load_dword s
61 ; GCN: s_load_dword s
6162 ; GCN: buffer_store_short
6263 ; GCN: buffer_store_short
6364 define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
7071 }
7172
7273 ; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
73 ; SICI: buffer_load_ushort
74 ; SICI: buffer_load_ushort
75 ; SICI: buffer_store_short
76 ; SICI: buffer_store_short
74 ; SI: s_load_dword s
75 ; SI: s_load_dword s
76 ; SI: buffer_store_short
77 ; SI: buffer_store_short
7778
7879 ; VI: s_load_dword s
7980 ; VI: s_load_dword s
9697 }
9798
9899 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
99 ; SICI: buffer_load_ushort
100 ; SICI: buffer_load_ushort
101 ; SICI: buffer_load_ushort
100 ; GCN: s_load_dword s
101 ; GCN: s_load_dword s
102 ; GCN: s_load_dword s
103 ; GCN-NOT: {{buffer|flat|global}}
102104
103 ; SICI: buffer_store_short
104 ; SICI: buffer_store_short
105 ; SICI: buffer_store_short
106
107 ; SICI: buffer_load_ushort
108 ; SICI: buffer_store_short
109
110 ; GFX9-DAG: global_load_short_d16_hi v
111 ; GFX9-DAG: global_load_short_d16 v
105 ; FIXME: Unnecessary repacking
106 ; GFX9: s_pack_ll_b32_b16
107 ; GFX9: s_pack_lh_b32_b16
112108
113109 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
114 ; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
115110
116 ; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
111
112 ; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s
117113
118114 ; GCN: {{buffer|global}}_store_short
119115 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
11 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
22
33 ; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
4 ; GCN: buffer_load_ubyte
5 ; GCN: buffer_store_byte
4 ; GCN: s_load_dword [[LOAD:s[0-9]+]]
5 ; GCN: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
6 ; GCN: buffer_store_byte [[V_LOAD]]
67 define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
78 %p0 = extractelement <1 x i8> %foo, i32 0
89 store i8 %p0, i8 addrspace(1)* %out
1011 }
1112
1213 ; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
13 ; GCN: buffer_load_ubyte
14 ; GCN: buffer_load_ubyte
14 ; GCN: s_load_dword s
15 ; GCN-NOT: {{flat|buffer|global}}
16 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
17 ; GCN-NOT: {{flat|buffer|global}}
1518 ; GCN: buffer_store_byte
1619 ; GCN: buffer_store_byte
1720 define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
2427 }
2528
2629 ; GCN-LABEL: {{^}}extract_vector_elt_v3i8:
27 ; GCN: buffer_load_ubyte
28 ; GCN: buffer_load_ubyte
30 ; GCN: s_load_dword s
31 ; GCN-NOT: {{flat|buffer|global}}
32 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
33 ; GCN-NOT: {{flat|buffer|global}}
2934 ; GCN: buffer_store_byte
3035 ; GCN: buffer_store_byte
3136 define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
3843 }
3944
4045 ; GCN-LABEL: {{^}}extract_vector_elt_v4i8:
41 ; GCN: buffer_load_ubyte
42 ; GCN: buffer_load_ubyte
46 ; GCN: s_load_dword s
47 ; GCN-NOT: {{flat|buffer|global}}
48 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
49 ; GCN-NOT: {{flat|buffer|global}}
4350 ; GCN: buffer_store_byte
4451 ; GCN: buffer_store_byte
4552 define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
5259 }
5360
5461 ; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
55 ; GCN: buffer_load_ubyte
56 ; GCN: buffer_load_ubyte
62 ; GCN: s_load_dword [[VAL:s[0-9]+]]
63 ; GCN-NOT: {{flat|buffer|global}}
64 ; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16
65 ; GCN-NOT: {{flat|buffer|global}}
5766 ; GCN: buffer_store_byte
5867 ; GCN: buffer_store_byte
5968 define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
6675 }
6776
6877 ; GCN-LABEL: {{^}}extract_vector_elt_v16i8:
69 ; GCN: buffer_load_ubyte
70 ; GCN: buffer_load_ubyte
71 ; GCN: buffer_store_byte
72 ; GCN: buffer_store_byte
78 ; GCN: s_load_dword [[LOAD0:s[0-9]+]]
79 ; GCN-NOT: {{flat|buffer|global}}
80 ; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
81 ; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
82 ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
83 ; GCN: buffer_store_byte [[V_ELT2]]
84 ; GCN: buffer_store_byte [[V_LOAD0]]
7385 define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
7486 %p0 = extractelement <16 x i8> %foo, i32 0
7587 %p1 = extractelement <16 x i8> %foo, i32 2
8092 }
8193
8294 ; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
83 ; GCN: buffer_load_ubyte
84 ; GCN: buffer_load_ubyte
85 ; GCN: buffer_store_byte
86 ; GCN: buffer_store_byte
95 ; GCN: s_load_dword [[LOAD0:s[0-9]+]]
96 ; GCN-NOT: {{flat|buffer|global}}
97 ; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
98 ; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
99 ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
100 ; GCN: buffer_store_byte [[V_ELT2]]
101 ; GCN: buffer_store_byte [[V_LOAD0]]
87102 define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
88103 %p0 = extractelement <32 x i8> %foo, i32 0
89104 %p1 = extractelement <32 x i8> %foo, i32 2
94109 }
95110
96111 ; GCN-LABEL: {{^}}extract_vector_elt_v64i8:
97 ; GCN: buffer_load_ubyte
98 ; GCN: buffer_load_ubyte
99 ; GCN: buffer_store_byte
100 ; GCN: buffer_store_byte
112 ; GCN: s_load_dword [[LOAD0:s[0-9]+]]
113 ; GCN-NOT: {{flat|buffer|global}}
114 ; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
115 ; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
116 ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
117 ; GCN: buffer_store_byte [[V_ELT2]]
118 ; GCN: buffer_store_byte [[V_LOAD0]]
101119 define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
102120 %p0 = extractelement <64 x i8> %foo, i32 0
103121 %p1 = extractelement <64 x i8> %foo, i32 2
109127
110128 ; FIXME: SI generates much worse code for this that's a pain to match
111129
130 ; FIXME: 16-bit and 32-bit shift not combined after legalize due to
131 ; isTypeDesirableForOp in SimplifyDemandedBits
132
112133 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
113 ; VI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]],
114 ; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
115
116 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
117 ; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[LOAD]]
134 ; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
135 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
136 ; VI-NOT: {{flat|buffer|global}}
137 ; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
138 ; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
139 ; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
140 ; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[ELT0]], [[ELT2]]
141 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
142 ; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[BUILD_VEC]]
118143 ; VI: buffer_store_byte [[EXTRACT]]
119144 define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 {
120145 %elt = extractelement <2 x i8> %foo, i32 %idx
123148 }
124149
125150 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
126 ; VI-DAG: buffer_load_ubyte [[LOAD2:v[0-9]+]],
127 ; VI-DAG: buffer_load_ushort [[LOAD01:v[0-9]+]],
128 ; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
129
151 ; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
152 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
153 ; VI-NOT: {{flat|buffer|global}}
154 ; VI: s_lshr_b32 [[ELT12:s[0-9]+]], [[LOAD]], 8
155 ; VI: v_lshlrev_b16_e64 [[ELT1:v[0-9]+]], 8, [[ELT12]]
156 ; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
157 ; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[ELT0]], [[ELT1]]
130158 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
131
132 ; VI: v_lshlrev_b32_e32 [[ELT2:v[0-9]+]], 16, [[LOAD2]]
133 ; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[LOAD01]], [[ELT2]]
134159 ; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]]
135160 ; VI: buffer_store_byte [[EXTRACT]]
136161 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
141166 }
142167
143168 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
144 ; VI-DAG: s_load_dword [[VEC3:s[0-9]+]], s[0:1], 0x2c
145 ; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
169 ; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
170 ; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
146171
147172 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
148 ; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC3]], [[SCALED_IDX]]
173 ; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC4]], [[SCALED_IDX]]
174
149175 ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
150176 ; VI: buffer_store_byte [[V_EXTRACT]]
151 define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
152 %p0 = extractelement <4 x i8> %foo, i32 %idx
177 define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
178 %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr
179 %p0 = extractelement <4 x i8> %vec, i32 %idx
153180 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
154181 store i8 %p0, i8 addrspace(1)* %out
155182 ret void
156183 }
157184
158185 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
159 ; VI-DAG: s_load_dwordx2 [[VEC3:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
160 ; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
186 ; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
187 ; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
161188
162189 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
163 ; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC3]], [[SCALED_IDX]]
190 ; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC8]], [[SCALED_IDX]]
164191 ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]]
165192 ; VI: buffer_store_byte [[V_EXTRACT]]
166 define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo, i32 %idx) #0 {
167 %p0 = extractelement <8 x i8> %foo, i32 %idx
193 define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
194 %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr
195 %p0 = extractelement <8 x i8> %vec, i32 %idx
168196 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
169197 store i8 %p0, i8 addrspace(1)* %out
170198 ret void
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
33
44 ; DAGCombiner will transform:
55 ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
66 ; unless isFabsFree returns true
77
88 ; GCN-LABEL: {{^}}s_fabs_free_f16:
9 ; GCN: {{flat|global}}_load_ushort [[VAL:v[0-9]+]],
10 ; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
11 ; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
12
9 ; GCN: s_load_dword [[VAL:s[0-9]+]]
10
11 ; CI: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
12 ; CI: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
13 ; CI: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
14
15 ; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff
16 ; GFX89: v_and_b32_e32 [[V_RESULT:v[0-9]+]], [[VAL]], [[MASK]]
17 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
1318 define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
1419 %bc= bitcast i16 %in to half
1520 %fabs = call half @llvm.fabs.f16(half %bc)
1823 }
1924
2025 ; GCN-LABEL: {{^}}s_fabs_f16:
21 ; CI: flat_load_ushort [[VAL:v[0-9]+]],
22 ; CI: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
23 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
26 ; GCN: s_load_dword [[VAL:s[0-9]+]]
27
28 ; CI: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
29 ; CI: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
30 ; CI: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
31
32 ; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff
33 ; GFX89: v_and_b32_e32 [[V_RESULT:v[0-9]+]], [[VAL]], [[MASK]]
34 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
2435 define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
2536 %fabs = call half @llvm.fabs.f16(half %in)
2637 store half %fabs, half addrspace(1)* %out
4253 ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
4354 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
4455 ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
45
4656 ; GCN: {{flat|global}}_store_dwordx2
4757 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
4858 %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
5161 }
5262
5363 ; GCN-LABEL: {{^}}fabs_fold_f16:
54 ; GCN: {{flat|global}}_load_ushort [[IN0:v[0-9]+]]
55 ; GCN: {{flat|global}}_load_ushort [[IN1:v[0-9]+]]
56
57 ; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]]
58 ; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]|
59 ; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[ABS_CVT1]], [[CVT0]]
60 ; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
64 ; GCN: s_load_dword [[IN0:s[0-9]+]]
65 ; GCN: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16
66
67 ; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]|
68 ; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]]
69 ; CI-DAG: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]]
70 ; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
6171 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
6272
63 ; VI-NOT: and
64 ; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]]
65 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
73 ; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]]
74 ; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]]
75 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
6676 define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
6777 %fabs = call half @llvm.fabs.f16(half %in0)
6878 %fmul = fmul half %fabs, %in1
22 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s
33
44 ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
5 ; CI: v_cvt_f32_f16_e32
6 ; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
5 ; CI-DAG: v_cvt_f32_f16_e32
6 ; CI-DAG: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |s{{[0-9]+}}|
77 ; CI: v_sub_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_ABS_X]]
88
99 ; GFX89-NOT: _and
10 ; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
10 ; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
1111 define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
1212 %fabs = call half @llvm.fabs.f16(half %x)
1313 %fsub = fsub half -0.0, %fabs
1818
1919 ; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
2020 ; CI-DAG: v_cvt_f32_f16_e32
21 ; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}|
21 ; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{s[0-9]+}}|
2222 ; CI: v_mul_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, [[CVT_NEG_ABS_X]]
2323 ; CI: v_cvt_f16_f32_e32
2424
2525 ; GFX89-NOT: _and
26 ; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}|
26 ; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{s[0-9]+}}, -|{{v[0-9]+}}|
2727 ; GFX89-NOT: [[MUL]]
2828 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
2929 define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
3939 ; unless isFabsFree returns true
4040
4141 ; GCN-LABEL: {{^}}fneg_fabs_free_f16:
42 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
42 ; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000
4343 define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
4444 %bc = bitcast i16 %in to half
4545 %fabs = call half @llvm.fabs.f16(half %bc)
4949 }
5050
5151 ; GCN-LABEL: {{^}}fneg_fabs_f16:
52 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
52 ; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000
5353 define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
5454 %fabs = call half @llvm.fabs.f16(half %in)
5555 %fsub = fsub half -0.0, %fabs
2727 ret void
2828 }
2929
30 ; GCN-LABEL: {{^}}fneg_free_f16:
31 ; GCN: {{flat|global}}_load_ushort [[NEG_VALUE:v[0-9]+]],
30 ; GCN-LABEL: {{^}}s_fneg_free_f16:
31 ; GCN: s_load_dword [[NEG_VALUE:s[0-9]+]],
3232
33 ; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
34 ; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]]
35 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
36 define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
33 ; CI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
34 ; CI: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]]
35 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_XOR]]
36
37 ; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000
38 ; GFX89: v_xor_b32_e32 [[XOR:v[0-9]+]], [[NEG_VALUE]], [[MASK]]
39 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
40 define amdgpu_kernel void @s_fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
3741 %bc = bitcast i16 %in to half
3842 %fsub = fsub half -0.0, %bc
3943 store half %fsub, half addrspace(1)* %out
33 ; half args should be promoted to float for SI and lower.
44
55 ; GCN-LABEL: {{^}}load_f16_arg:
6 ; GCN: flat_load_ushort [[ARG:v[0-9]+]]
7 ; GCN-NOT: [[ARG]]
8 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ARG]]
6 ; GCN: s_load_dword [[ARG:s[0-9]+]]
7 ; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
8 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_ARG]]
99 define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
1010 store half %arg, half addrspace(1)* %out
1111 ret void
2121 }
2222
2323 ; GCN-LABEL: {{^}}load_v3f16_arg:
24 ; GCN: flat_load_ushort
25 ; GCN: s_load_dword s
24 ; GCN: s_load_dword s
25 ; GCN: s_load_dword s
26 ; GCN-NOT: {{buffer|flat|global}}_load_
2627
2728 ; GCN-NOT: _load
2829 ; GCN-DAG: _store_dword
7576 }
7677
7778 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
78 ; GCN: flat_load_ushort
79 ; GCN: flat_load_ushort
80 ; GCN: flat_load_ushort
81 ; GCN-NOT: {{buffer|flat|global}}_load
79 ; GCN: s_load_dword s
80 ; GCN: s_load_dword s
81 ; GCN-NOT: _load
8282 ; GCN: v_cvt_f32_f16_e32
8383 ; GCN: v_cvt_f32_f16_e32
8484 ; GCN: v_cvt_f32_f16_e32
100100 }
101101
102102 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
103 ; SI: flat_load_ushort
104 ; SI: flat_load_ushort
105 ; SI: flat_load_ushort
106 ; SI: flat_load_ushort
107 ; SI: flat_load_ushort
108 ; SI: flat_load_ushort
109 ; SI: flat_load_ushort
110 ; SI: flat_load_ushort
111
112
113 ; VI: s_load_dword s
114 ; VI: s_load_dword s
115 ; VI: s_load_dword s
116 ; VI: s_load_dword s
103 ; GCN: s_load_dword s
104 ; GCN: s_load_dword s
105 ; GCN: s_load_dword s
106 ; GCN: s_load_dword s
117107
118108 ; GCN: v_cvt_f32_f16_e32
119109 ; GCN: v_cvt_f32_f16_e32
133123 }
134124
135125 ; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
136 ; GCN: flat_load_ushort [[ARG:v[0-9]+]]
126 ; GCN: s_load_dword [[ARG:s[0-9]+]]
137127 ; GCN: v_cvt_f32_f16_e32 v[[ARG_F32:[0-9]+]], [[ARG]]
138128 ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[ARG_F32]]
139129 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
144134 }
145135
146136 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
147 ; SI-DAG: flat_load_ushort v
148 ; SI-DAG: flat_load_ushort v
149
150 ; VI-DAG: s_load_dword s
151 ; VI: s_lshr_b32
137 ; GCN: s_load_dword
138 ; GCN: s_lshr_b32
152139
153140 ; GCN-DAG: v_cvt_f32_f16_e32
154141 ; GCN-DAG: v_cvt_f32_f16_e32
162149 }
163150
164151 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
165 ; GCN-DAG: flat_load_ushort v
166 ; GCN-DAG: flat_load_ushort v
167 ; GCN-DAG: flat_load_ushort v
152 ; GCN: s_load_dword
153 ; GCN: s_load_dword
154 ; GCN: s_lshr_b32
155
168156 ; GCN-DAG: v_cvt_f32_f16_e32
169157 ; GCN-DAG: v_cvt_f32_f16_e32
170158 ; GCN-DAG: v_cvt_f32_f16_e32
179167 }
180168
181169 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
182 ; SI: flat_load_ushort v
183 ; SI: flat_load_ushort v
184 ; SI: flat_load_ushort v
185 ; SI: flat_load_ushort v
186
187 ; VI: s_load_dword s
188 ; VI: s_load_dword s
170 ; GCN: s_load_dword s
171 ; GCN: s_load_dword s
189172
190173 ; GCN-DAG: v_cvt_f32_f16_e32
191174 ; GCN-DAG: v_cvt_f32_f16_e32
203186 }
204187
205188 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
206 ; SI: flat_load_ushort v
207 ; SI: flat_load_ushort v
208 ; SI: flat_load_ushort v
209 ; SI: flat_load_ushort v
210
211 ; SI: flat_load_ushort v
212 ; SI: flat_load_ushort v
213 ; SI: flat_load_ushort v
214 ; SI: flat_load_ushort v
215
216
217 ; VI: s_load_dword s
218 ; VI: s_load_dword s
219 ; VI: s_load_dword s
220 ; VI: s_load_dword s
221
222
223
224 ; GCN-DAG: v_cvt_f32_f16_e32
225 ; GCN-DAG: v_cvt_f32_f16_e32
226 ; GCN-DAG: v_cvt_f32_f16_e32
227 ; GCN-DAG: v_cvt_f32_f16_e32
228
229 ; GCN-DAG: v_cvt_f32_f16_e32
230 ; GCN-DAG: v_cvt_f32_f16_e32
231 ; GCN-DAG: v_cvt_f32_f16_e32
232 ; GCN-DAG: v_cvt_f32_f16_e32
233
234 ; GCN-DAG: v_cvt_f64_f32_e32
235 ; GCN-DAG: v_cvt_f64_f32_e32
236 ; GCN-DAG: v_cvt_f64_f32_e32
237 ; GCN-DAG: v_cvt_f64_f32_e32
238
239 ; GCN-DAG: v_cvt_f64_f32_e32
240 ; GCN-DAG: v_cvt_f64_f32_e32
241 ; GCN-DAG: v_cvt_f64_f32_e32
242 ; GCN-DAG: v_cvt_f64_f32_e32
189 ; GCN: s_load_dword s
190 ; GCN-NEXT: s_load_dword s
191 ; GCN-NEXT: s_load_dword s
192 ; GCN-NEXT: s_load_dword s
193 ; GCN-NOT: _load_
194
195 ; GCN-DAG: v_cvt_f32_f16_e32
196 ; GCN-DAG: v_cvt_f32_f16_e32
197 ; GCN-DAG: v_cvt_f32_f16_e32
198 ; GCN-DAG: v_cvt_f32_f16_e32
199
200 ; GCN-DAG: v_cvt_f32_f16_e32
201 ; GCN-DAG: v_cvt_f32_f16_e32
202 ; GCN-DAG: v_cvt_f32_f16_e32
203 ; GCN-DAG: v_cvt_f32_f16_e32
204
205 ; GCN-DAG: v_cvt_f64_f32_e32
206 ; GCN-DAG: v_cvt_f64_f32_e32
207 ; GCN-DAG: v_cvt_f64_f32_e32
208 ; GCN-DAG: v_cvt_f64_f32_e32
209
210 ; GCN: v_cvt_f64_f32_e32
211 ; GCN: v_cvt_f64_f32_e32
212 ; GCN: v_cvt_f64_f32_e32
213 ; GCN: v_cvt_f64_f32_e32
243214
244215 ; GCN: s_endpgm
245216 define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
123123 }
124124
125125 ; GCN-LABEL: {{^}}add_inline_imm_0.0_f16:
126 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
127 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
126 ; VI: s_load_dword [[VAL:s[0-9]+]]
127 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
128128 ; VI: buffer_store_short [[REG]]
129129 define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
130130 %y = fadd half %x, 0.0
133133 }
134134
135135 ; GCN-LABEL: {{^}}add_inline_imm_0.5_f16:
136 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
137 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
136 ; VI: s_load_dword [[VAL:s[0-9]+]]
137 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}}
138138 ; VI: buffer_store_short [[REG]]
139139 define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
140140 %y = fadd half %x, 0.5
143143 }
144144
145145 ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f16:
146 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
147 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
146 ; VI: s_load_dword [[VAL:s[0-9]+]]
147 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}}
148148 ; VI: buffer_store_short [[REG]]
149149 define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
150150 %y = fadd half %x, -0.5
153153 }
154154
155155 ; GCN-LABEL: {{^}}add_inline_imm_1.0_f16:
156 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
157 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
156 ; VI: s_load_dword [[VAL:s[0-9]+]]
157 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}}
158158 ; VI: buffer_store_short [[REG]]
159159 define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
160160 %y = fadd half %x, 1.0
163163 }
164164
165165 ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f16:
166 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
167 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
166 ; VI: s_load_dword [[VAL:s[0-9]+]]
167 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}}
168168 ; VI: buffer_store_short [[REG]]
169169 define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
170170 %y = fadd half %x, -1.0
173173 }
174174
175175 ; GCN-LABEL: {{^}}add_inline_imm_2.0_f16:
176 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
177 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
176 ; VI: s_load_dword [[VAL:s[0-9]+]]
177 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}}
178178 ; VI: buffer_store_short [[REG]]
179179 define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
180180 %y = fadd half %x, 2.0
183183 }
184184
185185 ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f16:
186 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
187 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
186 ; VI: s_load_dword [[VAL:s[0-9]+]]
187 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}}
188188 ; VI: buffer_store_short [[REG]]
189189 define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
190190 %y = fadd half %x, -2.0
193193 }
194194
195195 ; GCN-LABEL: {{^}}add_inline_imm_4.0_f16:
196 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
197 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
196 ; VI: s_load_dword [[VAL:s[0-9]+]]
197 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}}
198198 ; VI: buffer_store_short [[REG]]
199199 define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
200200 %y = fadd half %x, 4.0
203203 }
204204
205205 ; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f16:
206 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
207 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
206 ; VI: s_load_dword [[VAL:s[0-9]+]]
207 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}}
208208 ; VI: buffer_store_short [[REG]]
209209 define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
210210 %y = fadd half %x, -4.0
235235 }
236236
237237 ; GCN-LABEL: {{^}}add_inline_imm_1_f16:
238 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
239 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
238 ; VI: s_load_dword [[VAL:s[0-9]+]]
239 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 1{{$}}
240240 ; VI: buffer_store_short [[REG]]
241241 define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
242242 %y = fadd half %x, 0xH0001
245245 }
246246
247247 ; GCN-LABEL: {{^}}add_inline_imm_2_f16:
248 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
249 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
248 ; VI: s_load_dword [[VAL:s[0-9]+]]
249 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 2{{$}}
250250 ; VI: buffer_store_short [[REG]]
251251 define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
252252 %y = fadd half %x, 0xH0002
255255 }
256256
257257 ; GCN-LABEL: {{^}}add_inline_imm_16_f16:
258 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
259 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}}
258 ; VI: s_load_dword [[VAL:s[0-9]+]]
259 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 16{{$}}
260260 ; VI: buffer_store_short [[REG]]
261261 define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
262262 %y = fadd half %x, 0xH0010
267267 ; GCN-LABEL: {{^}}add_inline_imm_neg_1_f16:
268268 ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, -1
269269 ; VI: buffer_store_short [[REG]]
270 define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
271 %xbc = bitcast half %x to i16
272 %y = add i16 %xbc, -1
270 define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
271 %x = load i16, i16 addrspace(1)* %in
272 %y = add i16 %x, -1
273273 %ybc = bitcast i16 %y to half
274274 store half %ybc, half addrspace(1)* %out
275275 ret void
278278 ; GCN-LABEL: {{^}}add_inline_imm_neg_2_f16:
279279 ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, 0xfffe
280280 ; VI: buffer_store_short [[REG]]
281 define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
282 %xbc = bitcast half %x to i16
283 %y = add i16 %xbc, -2
281 define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
282 %x = load i16, i16 addrspace(1)* %in
283 %y = add i16 %x, -2
284284 %ybc = bitcast i16 %y to half
285285 store half %ybc, half addrspace(1)* %out
286286 ret void
289289 ; GCN-LABEL: {{^}}add_inline_imm_neg_16_f16:
290290 ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, 0xfff0
291291 ; VI: buffer_store_short [[REG]]
292 define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
293 %xbc = bitcast half %x to i16
294 %y = add i16 %xbc, -16
292 define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) {
293 %x = load i16, i16 addrspace(1)* %in
294 %y = add i16 %x, -16
295295 %ybc = bitcast i16 %y to half
296296 store half %ybc, half addrspace(1)* %out
297297 ret void
298298 }
299299
300300 ; GCN-LABEL: {{^}}add_inline_imm_63_f16:
301 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
302 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]]
301 ; VI: s_load_dword [[VAL:s[0-9]+]]
302 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 63
303303 ; VI: buffer_store_short [[REG]]
304304 define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
305305 %y = fadd half %x, 0xH003F
308308 }
309309
310310 ; GCN-LABEL: {{^}}add_inline_imm_64_f16:
311 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
312 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]]
311 ; VI: s_load_dword [[VAL:s[0-9]+]]
312 ; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 64
313313 ; VI: buffer_store_short [[REG]]
314314 define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
315315 %y = fadd half %x, 0xH0040
201201 }
202202
203203 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
204 ; VI: buffer_load_ushort [[LOAD:v[0-9]]]
205 ; VI: s_load_dword [[IDX:s[0-9]]]
204 ; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
205 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
206 ; VI-NOT: _load
207 ; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
206208 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
207 ; VI: v_lshlrev_b16_e64 [[SHL:v[0-9]+]], [[SCALED_IDX]], -1
208 ; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[SHL]]
209 ; VI: v_and_b32_e32 [[AND0:v[0-9]+]], 5, [[SHL]]
210 ; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[LOAD]]
211 ; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[AND0]], [[AND1]]
209 ; VI: v_lshlrev_b16_e64 [[ELT1_SHIFT:v[0-9]+]], 8, [[ELT1]]
210 ; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
211 ; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
212
213 ; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[MASK]]
214 ; VI: v_or_b32_e32 [[BUILD_VECTOR:v[0-9]+]], [[ELT0]], [[ELT1_SHIFT]]
215
216 ; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[BUILD_VECTOR]]
217 ; VI-DAG: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
218 ; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[BUILD_VECTOR]]
212219 ; VI: buffer_store_short [[OR]]
213220 define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
214221 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
216223 ret void
217224 }
218225
226 ; FIXME: post legalize i16 and i32 shifts aren't merged because of
227 ; isTypeDesirableForOp in SimplifyDemandedBits
228
219229 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
220 ; VI: buffer_load_ubyte
221 ; VI: buffer_load_ushort
222 ; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 3
223 ; VI: s_lshl_b32 s{{[0-9]+}}, 0xffff,
224 ; VI: s_not_b32
225 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
226 ; VI: v_or_b32_e32
227 ; VI: v_and_b32
228 ; VI: v_bfi_b32
229 ; VI: v_lshrrev_b32
230 ; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
231 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
232 ; VI-NOT: _load
233
234 ; VI: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[LOAD]], 8
235 ; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[VEC_HI]]
236 ; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
237 ; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[VEC_HI]], [[ELT2]]
238 ; VI: s_and_b32 [[ELT2:s[0-9]+]], [[LOAD]], 0xff0000{{$}}
239
240 ; VI: s_mov_b32 [[MASK16:s[0-9]+]], 0xffff{{$}}
241 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
242 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], [[MASK16]], [[SCALED_IDX]]
243
244 ; VI: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
245 ; VI: v_or_b32_sdwa [[SDWA:v[0-9]+]], [[BUILD_VEC]], [[V_ELT2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
246 ; VI: s_not_b32 [[NOT_SHIFT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
247 ; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[NOT_SHIFT_MASK]], [[SDWA]]
248 ; VI: v_lshrrev_b32_e32 [[HI2:v[0-9]+]], 16, [[AND_NOT_MASK]]
249 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SCALED_IDX]], 5, [[SDWA]]
250 ; VI: buffer_store_short [[BFI]]
251 ; VI: buffer_store_byte [[HI2]]
230252 define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
231253 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
232254 store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
234256 }
235257
236258 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
237 ; VI: s_load_dword [[VEC:s[0-9]+]]
238 ; VI: s_load_dword [[IDX:s[0-9]]]
259 ; VI: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
260 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
261 ; VI-NOT: _load
262
263 ; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 8
264 ; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
265 ; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff{{$}}
266
267
268 ; VI: s_lshr_b32 [[ELT3:s[0-9]+]], [[VEC]], 24
269 ; VI: s_lshr_b32 [[ELT2:s[0-9]+]], [[VEC]], 16
270 ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, [[ELT3]]
271 ; VI: v_or_b32_e32
272 ; VI: v_or_b32_sdwa
239273 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
240 ; VI-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
241 ; VI-DAG: v_mov_b32_e32 [[V_VEC:v[0-9]+]], [[VEC]]
242 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[MASK]], 5, [[V_VEC]]
274 ; VI: v_or_b32_sdwa
275 ; VI: s_lshl_b32
276 ; VI: v_bfi_b32
243277 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
244278 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
245279 store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
246280 ret void
247281 }
248282
249 ; GCN-LABEL: {{^}}dynamic_insertelement_v8i8:
250 ; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
283 ; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8:
284 ; VI-NOT: {{buffer|flat|global}}
251285 ; VI: s_load_dword [[IDX:s[0-9]]]
286 ; VI-NOT: {{buffer|flat|global}}
287 ; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
288 ; VI-NOT: {{buffer|flat|global}}
289
252290 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
253291 ; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
254292 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
260298 ; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
261299 ; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]]
262300 ; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}}
263 define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
301 define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
302 %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
264303 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
265304 store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
266305 ret void
267306 }
268307
269308 ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
270 ; GCN: buffer_load_ubyte
271 ; GCN: buffer_load_ubyte
272 ; GCN: buffer_load_ubyte
273 ; GCN: buffer_load_ubyte
274 ; GCN: buffer_load_ubyte
275 ; GCN: buffer_load_ubyte
276 ; GCN: buffer_load_ubyte
277 ; GCN: buffer_load_ubyte
278 ; GCN: buffer_load_ubyte
279 ; GCN: buffer_load_ubyte
280 ; GCN: buffer_load_ubyte
281 ; GCN: buffer_load_ubyte
282 ; GCN: buffer_load_ubyte
283 ; GCN: buffer_load_ubyte
284 ; GCN: buffer_load_ubyte
285 ; GCN: buffer_load_ubyte
309 ; GCN: s_load_dwordx2
310 ; GCN: s_load_dword s
311 ; GCN: s_load_dword s
312 ; GCN: s_load_dword s
313 ; GCN: s_load_dword s
314 ; GCN: s_load_dword s
315 ; GCN-NOT: _load_
316
286317
287318 ; GCN: buffer_store_byte
288319 ; GCN: buffer_store_byte
1010 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
1111 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
1212 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
13 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
14 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
15 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
16 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
17 ; FIXME: Should be using s_load_dword
18 ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
13
14 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
15 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
16
1917
2018 define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
2119 entry:
3028 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
3129 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
3230 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
33 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
34 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
35 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
36 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
37 ; FIXME: Should be using s_load_dword
38 ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
39
31
32 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
33 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
4034 define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
4135 entry:
4236 %0 = zext i8 %in to i32
4741 ; FUNC-LABEL: {{^}}i8_sext_arg:
4842 ; HSA-VI: kernarg_segment_byte_size = 12
4943 ; HSA-VI: kernarg_segment_alignment = 4
44 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
45 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
46
47 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
48
49 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
50 ; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
51 ; HSA-VI: flat_store_dword
52 define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
53 entry:
54 %0 = sext i8 %in to i32
55 store i32 %0, i32 addrspace(1)* %out, align 4
56 ret void
57 }
58
59 ; FUNC-LABEL: {{^}}i16_arg:
60 ; HSA-VI: kernarg_segment_byte_size = 12
61 ; HSA-VI: kernarg_segment_alignment = 4
62
63 ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
64 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
65
66 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
67 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
68
69 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
70 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
71 ; HSA-VI: flat_store_dword
72 define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
73 entry:
74 %0 = zext i16 %in to i32
75 store i32 %0, i32 addrspace(1)* %out, align 4
76 ret void
77 }
78
79 ; FUNC-LABEL: {{^}}i16_zext_arg:
80 ; HSA-VI: kernarg_segment_byte_size = 12
81 ; HSA-VI: kernarg_segment_alignment = 4
82
5083 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
5184 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
5285 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
53 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
54 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
55 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
56 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
57 ; FIXME: Should be using s_load_dword
58 ; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
59
60 define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
61 entry:
62 %0 = sext i8 %in to i32
63 store i32 %0, i32 addrspace(1)* %out, align 4
64 ret void
65 }
66
67 ; FUNC-LABEL: {{^}}i16_arg:
68 ; HSA-VI: kernarg_segment_byte_size = 12
69 ; HSA-VI: kernarg_segment_alignment = 4
70
71 ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
72 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
73 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
74 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
75 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
76 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
77 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
78 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
79 ; FIXME: Should be using s_load_dword
80 ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
81
82 define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
86
87 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
88 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
89 ; HSA-VI: flat_store_dword
90 define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
8391 entry:
8492 %0 = zext i16 %in to i32
8593 store i32 %0, i32 addrspace(1)* %out, align 4
8694 ret void
8795 }
8896
89 ; FUNC-LABEL: {{^}}i16_zext_arg:
97 ; FUNC-LABEL: {{^}}i16_sext_arg:
9098 ; HSA-VI: kernarg_segment_byte_size = 12
9199 ; HSA-VI: kernarg_segment_alignment = 4
92100
93101 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
94102 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
95103 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
96 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
97 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
98 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
99 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
100 ; FIXME: Should be using s_load_dword
101 ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
102
103 define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
104 entry:
105 %0 = zext i16 %in to i32
106 store i32 %0, i32 addrspace(1)* %out, align 4
107 ret void
108 }
109
110 ; FUNC-LABEL: {{^}}i16_sext_arg:
111 ; HSA-VI: kernarg_segment_byte_size = 12
112 ; HSA-VI: kernarg_segment_alignment = 4
113
114 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
115 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
116 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
117 ; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
118 ; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
119 ; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
120 ; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
121 ; FIXME: Should be using s_load_dword
122 ; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
123
104
105
106 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
107 ; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
108 ; HSA-VI: flat_store_dword
124109 define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
125110 entry:
126111 %0 = sext i16 %in to i32
162147 ; EG: VTX_READ_8
163148 ; EG: VTX_READ_8
164149
165 ; SI: buffer_load_ubyte
166 ; SI: buffer_load_ubyte
167
168 ; HSA: flat_load_ushort
150 ; GCN: s_load_dword s
151 ; GCN-NOT: {{buffer|flat|global}}_load_
169152 define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
170153 entry:
171154 store <2 x i8> %in, <2 x i8> addrspace(1)* %out
225208 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
226209 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
227210 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
228 ; SI: buffer_load_ubyte
229 ; SI: buffer_load_ubyte
230 ; SI: buffer_load_ubyte
231
232 ; MESA-VI: buffer_load_ushort
233 ; MESA-VI: buffer_load_ubyte
234
235 ; HSA-VI: flat_load_ushort
236 ; HSA-VI: flat_load_ubyte
211
212 ; GCN: s_load_dword s
213 ; GCN-NOT: {{buffer|flat|global}}_load_
237214 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
238215 entry:
239216 store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
248225 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
249226 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
250227
251 ; GCN-DAG: s_load_dword s
252 ; GCN-DAG: {{buffer|flat}}_load_ushort
228 ; GCN: s_load_dword s
229 ; GCN: s_load_dword s
253230 define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
254231 entry:
255232 store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
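; <3 x i16> occupies six bytes starting at a 4-byte-aligned kernarg offset, so
; the two s_load_dword checks above correspond to the two dwords covering the
; vector; the unused upper half of the second dword is simply ignored.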
293270 ; EG: VTX_READ_8
294271 ; EG: VTX_READ_8
295272
296 ; SI: buffer_load_ubyte
297 ; SI: buffer_load_ubyte
298 ; SI: buffer_load_ubyte
299 ; SI: buffer_load_ubyte
300
301 ; VI: s_load_dword s
273 ; GCN: s_load_dword s
274 ; GCN-NOT: {{buffer|flat|global}}_load_
302275 define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
303276 entry:
304277 store <4 x i8> %in, <4 x i8> addrspace(1)* %out
313286 ; EG: VTX_READ_16
314287 ; EG: VTX_READ_16
315288
316 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
289 ; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
290 ; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
317291 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
318292
319293 ; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
360334 ret void
361335 }
362336
337 ; FIXME: Lots of unpack and re-pack junk on VI
363338 ; FUNC-LABEL: {{^}}v8i8_arg:
364339 ; HSA-VI: kernarg_segment_byte_size = 16
365340 ; HSA-VI: kernarg_segment_alignment = 4
372347 ; EG: VTX_READ_8
373348 ; EG: VTX_READ_8
374349
375 ; SI: buffer_load_ubyte
376 ; SI: buffer_load_ubyte
377 ; SI: buffer_load_ubyte
378 ; SI: buffer_load_ubyte
379 ; SI: buffer_load_ubyte
380 ; SI: buffer_load_ubyte
381 ; SI: buffer_load_ubyte
382
383 ; VI: s_load_dwordx2
384 ; VI: s_load_dwordx2
350
351 ; SI: s_load_dword s
352 ; SI: s_load_dword s
353 ; SI: s_load_dwordx2 s
354 ; SI-NOT: {{buffer|flat|global}}_load
355
356 ; VI: s_load_dword s
357 ; VI: s_load_dword s
358
359 ; VI: v_lshlrev_b16
360 ; VI: v_or_b32_e32
361 ; VI: v_or_b32_sdwa
362 ; VI: v_or_b32_sdwa
363 ; VI: v_lshlrev_b16
364 ; VI: s_lshr_b32
365 ; VI: v_or_b32_sdwa
366 ; VI: v_or_b32_sdwa
385367 define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
386368 entry:
387369 store <8 x i8> %in, <8 x i8> addrspace(1)* %out
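; The v_lshlrev_b16/v_or_b32_sdwa sequence checked above is the pack/repack
; overhead the FIXME refers to: the bytes arrive in SGPRs via s_load_dword but
; are split apart and reassembled in VALU registers before the store.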
400382 ; EG: VTX_READ_16
401383 ; EG: VTX_READ_16
402384
385 ; SI: s_load_dword s
386 ; SI: s_load_dword s
387 ; SI: s_load_dword s
388 ; SI: s_load_dword s
403389 ; SI: s_load_dwordx2
404 ; SI: s_load_dwordx2
405 ; SI: s_load_dwordx2
390 ; SI-NOT: {{buffer|flat|global}}_load
391
406392
407393 ; VI: s_load_dwordx2
408394 ; VI: s_load_dword s
453439 ret void
454440 }
455441
442 ; FIXME: Pack/repack on VI
443
456444 ; FUNC-LABEL: {{^}}v16i8_arg:
457445 ; HSA-VI: kernarg_segment_byte_size = 32
458446 ; HSA-VI: kernarg_segment_alignment = 4
473461 ; EG: VTX_READ_8
474462 ; EG: VTX_READ_8
475463
476 ; SI: buffer_load_ubyte
477 ; SI: buffer_load_ubyte
478 ; SI: buffer_load_ubyte
479 ; SI: buffer_load_ubyte
480 ; SI: buffer_load_ubyte
481 ; SI: buffer_load_ubyte
482 ; SI: buffer_load_ubyte
483 ; SI: buffer_load_ubyte
484 ; SI: buffer_load_ubyte
485 ; SI: buffer_load_ubyte
486 ; SI: buffer_load_ubyte
487 ; SI: buffer_load_ubyte
488 ; SI: buffer_load_ubyte
489 ; SI: buffer_load_ubyte
490 ; SI: buffer_load_ubyte
491 ; SI: buffer_load_ubyte
492
493 ; VI: s_load_dwordx2
494 ; VI: s_load_dwordx2
495 ; VI: s_load_dwordx2
464 ; SI: s_load_dword s
465 ; SI: s_load_dword s
466 ; SI: s_load_dword s
467 ; SI: s_load_dword s
468 ; SI: s_load_dwordx2
469 ; SI-NOT: {{buffer|flat|global}}_load
470
471
472 ; VI: s_load_dword s
473 ; VI: s_load_dword s
474 ; VI: s_load_dword s
475 ; VI: s_load_dword s
476
477 ; VI: s_lshr_b32
478 ; VI: v_lshlrev_b16
479 ; VI: s_lshr_b32
480 ; VI: s_lshr_b32
481 ; VI: v_or_b32_sdwa
482 ; VI: v_or_b32_sdwa
483 ; VI: v_lshlrev_b16
484 ; VI: v_lshlrev_b16
485 ; VI: v_or_b32_sdwa
486 ; VI: v_or_b32_sdwa
487 ; VI: v_lshlrev_b16
488 ; VI: v_lshlrev_b16
489 ; VI: v_or_b32_sdwa
490 ; VI: v_or_b32_sdwa
496491 define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
497492 entry:
498493 store <16 x i8> %in, <16 x i8> addrspace(1)* %out
507502 ; EG: VTX_READ_16
508503 ; EG: VTX_READ_16
509504 ; EG: VTX_READ_16
510 ; EG: VTX_READ_16
511 ; EG: VTX_READ_16
512 ; EG: VTX_READ_16
513 ; EG: VTX_READ_16
514 ; EG: VTX_READ_16
515 ; EG: VTX_READ_16
516 ; EG: VTX_READ_16
517 ; EG: VTX_READ_16
518 ; EG: VTX_READ_16
519 ; EG: VTX_READ_16
520 ; EG: VTX_READ_16
521
522 ; SI: s_load_dword s
523 ; SI: s_load_dword s
524 ; SI: s_load_dword s
525 ; SI: s_load_dword s
526 ; SI: s_load_dwordx2
527 ; SI: s_load_dwordx2
528 ; SI: s_load_dwordx2
505
506 ; EG: VTX_READ_16
507 ; EG: VTX_READ_16
508 ; EG: VTX_READ_16
509 ; EG: VTX_READ_16
510 ; EG: VTX_READ_16
511 ; EG: VTX_READ_16
512 ; EG: VTX_READ_16
513 ; EG: VTX_READ_16
514 ; EG: VTX_READ_16
515 ; EG: VTX_READ_16
516 ; EG: VTX_READ_16
517
518 ; SI: s_load_dword s
519 ; SI: s_load_dword s
520 ; SI: s_load_dword s
521 ; SI: s_load_dword s
522 ; SI: s_load_dword s
523 ; SI: s_load_dword s
524 ; SI: s_load_dword s
525 ; SI: s_load_dword s
526
527 ; SI-NOT: {{buffer|flat|global}}_load
528
529529
530530 ; VI: s_load_dword s
531531 ; VI: s_load_dword s
633633 ; HSA-VI: kernarg_segment_byte_size = 12
634634 ; HSA-VI: kernarg_segment_alignment = 4
635635
636 ; SI: buffer_load_ubyte
637 ; SI: v_and_b32_e32
638 ; SI: buffer_store_byte
639 ; SI: s_endpgm
636 ; GCN: s_load_dword s
637 ; GCN: s_and_b32
638 ; GCN: {{buffer|flat}}_store_byte
640639 define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
641640 store i1 %x, i1 addrspace(1)* %out, align 1
642641 ret void
646645 ; HSA-VI: kernarg_segment_byte_size = 12
647646 ; HSA-VI: kernarg_segment_alignment = 4
648647
649 ; SI: buffer_load_ubyte
650 ; SI: buffer_store_dword
651 ; SI: s_endpgm
648 ; GCN: s_load_dword
649 ; GCN: {{buffer|flat}}_store_dword
652650 define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
653651 %ext = zext i1 %x to i32
654652 store i32 %ext, i32 addrspace(1)* %out, align 4
659657 ; HSA-VI: kernarg_segment_byte_size = 12
660658 ; HSA-VI: kernarg_segment_alignment = 4
661659
662 ; SI: buffer_load_ubyte
663 ; SI: buffer_store_dwordx2
664 ; SI: s_endpgm
660 ; GCN: s_load_dword s
661 ; GCN: {{buffer|flat}}_store_dwordx2
665662 define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
666663 %ext = zext i1 %x to i64
667664 store i64 %ext, i64 addrspace(1)* %out, align 8
672669 ; HSA-VI: kernarg_segment_byte_size = 12
673670 ; HSA-VI: kernarg_segment_alignment = 4
674671
675 ; SI: buffer_load_ubyte
676 ; SI: buffer_store_dword
677 ; SI: s_endpgm
672 ; GCN: s_load_dword
673 ; GCN: {{buffer|flat}}_store_dword
678674 define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
679675 %ext = sext i1 %x to i32
679675 store i32 %ext, i32 addrspace(1)* %out, align 4
685681 ; HSA-VI: kernarg_segment_byte_size = 12
686682 ; HSA-VI: kernarg_segment_alignment = 4
687683
688 ; SI: buffer_load_ubyte
689 ; SI: v_bfe_i32
690 ; SI: v_ashrrev_i32
691 ; SI: buffer_store_dwordx2
692 ; SI: s_endpgm
684 ; GCN: s_load_dword
685 ; GCN: s_bfe_i64
686 ; GCN: {{buffer|flat}}_store_dwordx2
693687 define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
694688 %ext = sext i1 %x to i64
695689 store i64 %ext, i64 addrspace(1)* %out, align 8
22 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX9 %s
33
44 ; GCN-LABEL: {{^}}buffer_store_format_d16_x:
5 ; GCN: {{buffer|flat|global}}_load_ushort v[[LO:[0-9]+]]
6 ; GCN: buffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
5 ; GCN: s_load_dword s[[LO:[0-9]+]]
6 ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
7 ; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
78 define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) {
89 main_body:
910 call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
2323 }
2424
2525 ; GCN-LABEL: {{^}}class_f16_fabs:
26 ; GCN-DAG: buffer_load_ushort v[[SA_F16:[0-9]+]]
27 ; GCN-DAG: s_load_dword s[[SB_I32:[0-9]+]]
28 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |v[[SA_F16]]|, s[[SB_I32]]
26 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
27 ; GCN: s_load_dword s[[SB_I32:[0-9]+]]
28 ; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]]
29 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |s[[SA_F16]]|, [[V_B_I32]]
2930 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
3031 ; GCN: buffer_store_dword v[[VR_I32]]
3132 ; GCN: s_endpgm
4142 ret void
4243 }
4344
44 ; GCN-LABEL: {{^}}class_f16_fneg
45 ; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]]
45 ; GCN-LABEL: {{^}}class_f16_fneg:
46 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
4647 ; GCN: s_load_dword s[[SB_I32:[0-9]+]]
47 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -v[[SA_F16]], s[[SB_I32]]
48 ; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]]
49 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -s[[SA_F16]], [[V_B_I32]]
4850 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
4951 ; GCN: buffer_store_dword v[[VR_I32]]
5052 ; GCN: s_endpgm
6062 ret void
6163 }
6264
63 ; GCN-LABEL: {{^}}class_f16_fabs_fneg
64 ; GCN-DAG: buffer_load_ushort v[[SA_F16:[0-9]+]]
65 ; GCN-DAG: s_load_dword s[[SB_I32:[0-9]+]]
66 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|v[[SA_F16]]|, s[[SB_I32]]
65 ; GCN-LABEL: {{^}}class_f16_fabs_fneg:
66 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
67 ; GCN: s_load_dword s[[SB_I32:[0-9]+]]
68 ; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]]
69 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|s[[SA_F16]]|, [[V_B_I32]]
6770 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
6871 ; GCN: buffer_store_dword v[[VR_I32]]
6972 ; GCN: s_endpgm
8184 }
8285
8386 ; GCN-LABEL: {{^}}class_f16_1:
84 ; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]]
85 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[SA_F16]], 1{{$}}
87 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
88 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s[[SA_F16]], 1{{$}}
8689 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
8790 ; GCN: buffer_store_dword v[[VR_I32]]
8891 ; GCN: s_endpgm
97100 }
98101
99102 ; GCN-LABEL: {{^}}class_f16_64
100 ; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]]
101 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[SA_F16]], 64{{$}}
103 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
104 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s[[SA_F16]], 64{{$}}
102105 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
103106 ; GCN: buffer_store_dword v[[VR_I32]]
104107 ; GCN: s_endpgm
113116 }
114117
115118 ; GCN-LABEL: {{^}}class_f16_full_mask:
116 ; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]]
119 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
117120 ; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x3ff{{$}}
118 ; VI: v_cmp_class_f16_e32 vcc, v[[SA_F16]], v[[MASK]]
121 ; VI: v_cmp_class_f16_e32 vcc, s[[SA_F16]], v[[MASK]]
119122 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
120123 ; GCN: buffer_store_dword v[[VR_I32]]
121124 ; GCN: s_endpgm
129132 ret void
130133 }
131134
132 ; GCN-LABEL: {{^}}class_f16_nine_bit_mask
133 ; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]]
135 ; GCN-LABEL: {{^}}class_f16_nine_bit_mask:
136 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
134137 ; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x1ff{{$}}
135 ; VI: v_cmp_class_f16_e32 vcc, v[[SA_F16]], v[[MASK]]
138 ; VI: v_cmp_class_f16_e32 vcc, s[[SA_F16]], v[[MASK]]
136139 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
137140 ; GCN: buffer_store_dword v[[VR_I32]]
138141 ; GCN: s_endpgm
4949 }
5050
5151 ; GCN-LABEL: {{^}}image_store_f16
52 ; GCN: {{flat|global}}_load_ushort v[[LO:[0-9]+]],
53 ; GCN: image_store v[[LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16
52 ; GCN: s_load_dword s[[LO:[0-9]+]],
53 ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
54 ; GCN: image_store v[[V_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16
5455 define amdgpu_kernel void @image_store_f16(half %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
5556 main_body:
5657 call void @llvm.amdgcn.image.store.f16.v4i32.v8i32(half %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
33
44
55 ; GCN-LABEL: {{^}}tbuffer_store_d16_x:
6 ; GCN: {{flat|global}}_load_ushort v[[LO:[0-9]+]],
7 ; GCN: tbuffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
6 ; GCN: s_load_dword s[[S_LO:[0-9]+]]
7 ; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
8 ; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
89 define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) {
910 main_body:
1011 call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
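; For the d16 store intrinsics the half argument is now fetched with
; s_load_dword, but buffer/tbuffer/image stores take VGPR data operands, so a
; v_mov_b32_e32 copies the SGPR value into a VGPR first, as checked above.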
1717 ; VI-DAG: s_lshl_b32
1818 ; VI: v_or_b32_e32
1919
20 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
21 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
22 ; CI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
23 ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
20 ; CI: s_load_dword s
21 ; CI-NEXT: s_load_dword s
22 ; CI-NOT: {{buffer|flat}}
23 ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
24 ; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
25 ; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
26 ; CI: s_and_b32
27 ; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
28 ; CI: s_and_b32
29 ; CI: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
30 ; CI: s_lshl_b32
31 ; CI: v_or_b32_e32
2432 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
2533 %result = lshr <2 x i16> %lhs, %rhs
2634 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
7575 ; extloads with mubuf instructions.
7676
7777 ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8:
78 ; GCN: buffer_load_sbyte
79 ; GCN: buffer_load_sbyte
80 ; GCN: buffer_load_sbyte
81 ; GCN: buffer_load_sbyte
82 ; GCN: buffer_load_sbyte
83 ; GCN: buffer_load_sbyte
84 ; GCN: buffer_load_sbyte
85 ; GCN: buffer_load_sbyte
86
87 ; SI: v_min_i32
88 ; SI: v_min_i32
89 ; SI: v_min_i32
90 ; SI: v_min_i32
91
92 ; VI: v_min_i32
93 ; VI: v_min_i32
94 ; VI: v_min_i32
95 ; VI: v_min_i32
78 ; GCN: s_load_dword s
79 ; GCN: s_load_dword s
80 ; GCN-NOT: _load_
81
82 ; SI: s_min_i32
83 ; SI: s_min_i32
84 ; SI: s_min_i32
85 ; SI: s_min_i32
86
87 ; VI: s_min_i32
88 ; VI: s_min_i32
89 ; VI: s_min_i32
90 ; VI: s_min_i32
9691
9792 ; GFX9: v_min_i16
9893 ; GFX9: v_min_i16
9994 ; GFX9: v_min_i16
10095 ; GFX9: v_min_i16
10196
102 ; GCN: s_endpgm
103
10497 ; EG: MIN_INT
10598 ; EG: MIN_INT
10699 ; EG: MIN_INT
113106 }
114107
115108 ; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16:
116 ; SI: v_min_i32
117 ; SI: v_min_i32
109 ; GCN: s_load_dword s
110 ; GCN: s_load_dword s
111
112 ; SI: s_ashr_i32
113 ; SI: s_ashr_i32
114 ; SI: s_sext_i32_i16
115 ; SI: s_sext_i32_i16
116 ; SI: s_min_i32
117 ; SI: s_min_i32
118118
119119 ; VI: s_sext_i32_i16
120120 ; VI: s_sext_i32_i16
133133 }
134134
135135 ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
136 ; SI: v_min_i32
137 ; SI: v_min_i32
138 ; SI: v_min_i32
139 ; SI: v_min_i32
136 ; SI-NOT: buffer_load
137 ; SI: s_min_i32
138 ; SI: s_min_i32
139 ; SI: s_min_i32
140 ; SI: s_min_i32
140141
141142 ; VI: s_min_i32
142143 ; VI: s_min_i32
452453 }
453454
454455 ; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16:
455 ; SI: v_min_u32
456 ; SI: v_min_u32
457 ; SI: v_min_u32
458 ; SI: v_min_u32
459 ; SI: v_min_u32
460 ; SI: v_min_u32
461 ; SI: v_min_u32
462 ; SI: v_min_u32
456 ; GCN-NOT: {{buffer|flat|global}}_load
457 ; SI: s_min_u32
458 ; SI: s_min_u32
459 ; SI: s_min_u32
460 ; SI: s_min_u32
461 ; SI: s_min_u32
462 ; SI: s_min_u32
463 ; SI: s_min_u32
464 ; SI: s_min_u32
463465
464466 ; VI: s_min_u32
465467 ; VI: s_min_u32
3939
4040 ; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
4141 ; GCN: s_load_dword s
42 ; GCN: s_load_dwordx2 s
42 ; GCN-NEXT: s_load_dword s
43 ; GCN-NEXT: s_load_dword s
44 ; GCN-NOT: {{buffer|flat|global}}
45
4346 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
4447 define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
4548 %x.bc = bitcast <4 x i16> %x to <2 x i32>
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
22
3 ; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI
3 ; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN
44
5 ; FUNC-LABEL: {{^}}select_i1:
6 ; SI: v_cndmask_b32
7 ; SI-NOT: v_cndmask_b32
5 ; GCN-LABEL: {{^}}select_i1:
6 ; GCN: v_cndmask_b32
7 ; GCN-NOT: v_cndmask_b32
88 define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
99 %cmp = icmp ugt i32 %cond, 5
1010 %sel = select i1 %cmp, i1 %a, i1 %b
1212 ret void
1313 }
1414
15 ; FUNC-LABEL: {{^}}s_minmax_i1:
16 ; SI-DAG: buffer_load_ubyte [[COND:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
17 ; SI-DAG: buffer_load_ubyte [[A:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:45
18 ; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
19 ; SI: v_cmp_eq_u32_e32 vcc, 1, [[COND]]
20 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
15 ; GCN-LABEL: {{^}}s_minmax_i1:
16 ; GCN: s_load_dword [[LOAD:s[0-9]+]],
17 ; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8
18 ; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16
19 ; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]]
20 ; GCN-DAG: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
21 ; GCN-DAG: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
22 ; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1
23 ; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
24 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
2125 define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
2226 %cmp = icmp slt i1 %cond, false
2327 %sel = select i1 %cmp, i1 %a, i1 %b
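; All three i1 arguments land in the same kernarg dword, so the checks above
; expect a single s_load_dword followed by s_and_b32/s_lshr_b32 to peel %cond,
; %a, and %b out of bits 0, 8, and 16 respectively.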
662662
663663 ; FUNC-LABEL: {{^}}sext_in_reg_v3i1_to_v3i16:
664664 ; GFX9: v_pk_add_u16
665 ; GFX9: v_pk_add_u16
666 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
665667 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
666668 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
667 ; GFX9: v_pk_add_u16
668 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
669669 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
670670 define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
671671 %c = add <3 x i16> %a, %b ; add to prevent folding into extload
701701
702702 ; FUNC-LABEL: {{^}}sext_in_reg_v3i8_to_v3i16:
703703 ; GFX9: v_pk_add_u16
704 ; GFX9: v_pk_add_u16
705 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
704706 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
705707 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
706
707 ; GFX9: v_pk_add_u16
708 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
709708 ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
710709 define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
711710 %c = add <3 x i16> %a, %b ; add to prevent folding into extload
1313 ; VI: s_lshr_b32
1414 ; VI: s_and_b32
1515 ; VI: s_and_b32
16 ; SI: s_and_B32
17 ; SI: s_or_b32
16 ; VI: s_and_b32
17 ; VI: s_or_b32
1818
19 ; CI-DAG: v_lshlrev_b32_e32
20 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
21 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
22 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
23 ; CI: v_or_b32_e32
19
20 ; CI: s_load_dword s
21 ; CI: s_load_dword s
22 ; CI: s_lshr_b32
23 ; CI: s_and_b32
24 ; CI: s_lshr_b32
25 ; CI: s_lshl_b32
26 ; CI: s_lshl_b32
27 ; CI: s_lshl_b32
28 ; CI: s_and_b32
29 ; CI: s_or_b32
30 ; CI: _store_dword
2431 define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
2532 %result = shl <2 x i16> %lhs, %rhs
2633 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
1717 ; SI: s_and_b32
1818 ; SI: s_or_b32
1919
20 ; CI: v_sub_i32_e32
21 ; CI-DAG: v_sub_i32_e32
22 ; CI: v_bfe_i32
23 ; CI-DAG: v_bfe_i32
24 ; CI-DAG: v_add_i32_e32
25 ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
26 ; CI: v_add_i32_e32
27 ; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
28 ; CI: v_or_b32_e32
20 ; CI-NOT: {{buffer|flat}}_load
21 ; CI: s_load_dword s
22 ; CI-NOT: {{buffer|flat}}_load
23 ; CI: s_lshr_b32
24 ; CI: s_ashr_i32
25 ; CI: s_sext_i32_i16
26 ; CI: s_sub_i32
27 ; CI: s_sub_i32
28 ; CI: s_sext_i32_i16
29 ; CI: s_sext_i32_i16
30 ; CI: s_max_i32
31 ; CI: s_max_i32
32 ; CI: s_lshl_b32
33 ; CI: s_add_i32
34 ; CI: s_add_i32
35 ; CI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff
36 ; CI: s_or_b32
37
2938 define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
3039 %neg = sub <2 x i16> zeroinitializer, %val
3140 %cond = icmp sgt <2 x i16> %val, %neg
0 ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
1 ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
2
3 ; GCN-LABEL: {{^}}widen_i16_constant_load:
4 ; GCN: s_load_dword [[VAL:s[0-9]+]]
5 ; GCN: s_addk_i32 [[VAL]], 0x3e7
6 ; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 4
7 define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) {
8 %load = load i16, i16 addrspace(4)* %arg, align 4
9 %add = add i16 %load, 999
10 %or = or i16 %add, 4
11 store i16 %or, i16 addrspace(1)* null
12 ret void
13 }
14
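; In DAG terms the widening is roughly (a sketch, not the literal nodes):
;   %wide = load i32, i32 addrspace(4)* %arg.cast, align 4
;   %val  = trunc i32 %wide to i16
; which keeps the value uniform in SGPRs, so the add/or select to the scalar
; s_addk_i32/s_or_b32 checked above.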
15 ; GCN-LABEL: {{^}}widen_i16_constant_load_zext_i32:
16 ; GCN: s_load_dword [[VAL:s[0-9]+]]
17 ; GCN: s_and_b32 [[TRUNC:s[0-9]+]], [[VAL]], 0xffff{{$}}
18 ; GCN: s_addk_i32 [[TRUNC]], 0x3e7
19 ; GCN: s_or_b32 [[OR:s[0-9]+]], [[TRUNC]], 4
20 define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %arg) {
21 %load = load i16, i16 addrspace(4)* %arg, align 4
22 %ext = zext i16 %load to i32
23 %add = add i32 %ext, 999
24 %or = or i32 %add, 4
25 store i32 %or, i32 addrspace(1)* null
26 ret void
27 }
28
29 ; GCN-LABEL: {{^}}widen_i16_constant_load_sext_i32:
30 ; GCN: s_load_dword [[VAL:s[0-9]+]]
31 ; GCN: s_sext_i32_i16 [[EXT:s[0-9]+]], [[VAL]]
32 ; GCN: s_addk_i32 [[EXT]], 0x3e7
33 ; GCN: s_or_b32 [[OR:s[0-9]+]], [[EXT]], 4
34 define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %arg) {
35 %load = load i16, i16 addrspace(4)* %arg, align 4
36 %ext = sext i16 %load to i32
37 %add = add i32 %ext, 999
38 %or = or i32 %add, 4
39 store i32 %or, i32 addrspace(1)* null
40 ret void
41 }
42
43 ; GCN-LABEL: {{^}}widen_i17_constant_load:
44 ; GCN: s_load_dword [[VAL:s[0-9]+]]
45 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 34
46 ; GCN: s_or_b32 [[OR:s[0-9]+]], [[ADD]], 4
47 ; GCN: s_bfe_u32 s{{[0-9]+}}, [[OR]], 0x10010
48 define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
49 %load = load i17, i17 addrspace(4)* %arg, align 4
50 %add = add i17 %load, 34
51 %or = or i17 %add, 4
52 store i17 %or, i17 addrspace(1)* null
53 ret void
54 }
55
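; The s_bfe_u32 source operand packs (width << 16) | offset, so 0x10010
; extracts one bit at offset 16, i.e. the high bit of the widened i17 value.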
56 ; GCN-LABEL: {{^}}widen_f16_constant_load:
57 ; GCN: s_load_dword [[VAL:s[0-9]+]]
58 ; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[VAL]]
59 ; SI: v_add_f32_e32 [[ADD:v[0-9]+]], 4.0, [[CVT]]
60
61 ; VI: v_add_f16_e64 [[ADD:v[0-9]+]], [[VAL]], 4.0
62 define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) {
63 %load = load half, half addrspace(4)* %arg, align 4
64 %add = fadd half %load, 4.0
65 store half %add, half addrspace(1)* null
66 ret void
67 }
68
69 ; FIXME: valu usage on VI
70 ; GCN-LABEL: {{^}}widen_v2i8_constant_load:
71 ; GCN: s_load_dword [[VAL:s[0-9]+]]
72
73 ; SI: s_add_i32
74 ; SI: s_or_b32
75 ; SI: s_addk_i32
76 ; SI: s_and_b32
77 ; SI: s_or_b32
78 ; SI: s_or_b32
79
80 ; VI: s_add_i32
81 ; VI: v_add_u32_sdwa
82 ; VI: v_or_b32_sdwa
83 ; VI: v_or_b32_e32
84 define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) {
85 %load = load <2 x i8>, <2 x i8> addrspace(4)* %arg, align 4
86 %add = add <2 x i8> %load, <i8 12, i8 44>
87 %or = or <2 x i8> %add, <i8 4, i8 3>
88 store <2 x i8> %or, <2 x i8> addrspace(1)* null
89 ret void
90 }
91
92 ; GCN-LABEL: {{^}}no_widen_i16_constant_divergent_load:
93 ; GCN: {{buffer|flat}}_load_ushort
94 define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)* %arg) {
95 %tid = call i32 @llvm.amdgcn.workitem.id.x()
96 %tid.ext = zext i32 %tid to i64
97 %gep.arg = getelementptr inbounds i16, i16 addrspace(4)* %arg, i64 %tid.ext
98 %load = load i16, i16 addrspace(4)* %gep.arg, align 4
99 %add = add i16 %load, 999
100 %or = or i16 %add, 4
101 store i16 %or, i16 addrspace(1)* null
102 ret void
103 }
104
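; Here the address is indexed by the workitem id, so the load is divergent and
; cannot use the scalar unit; widening is correctly skipped and a VMEM ushort
; load remains.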
105 ; GCN-LABEL: {{^}}widen_i1_constant_load:
106 ; GCN: s_load_dword [[VAL:s[0-9]+]]
107 ; GCN: s_and_b32 {{s[0-9]+}}, [[VAL]], 1{{$}}
108 define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) {
109 %load = load i1, i1 addrspace(4)* %arg, align 4
110 %and = and i1 %load, true
111 store i1 %and, i1 addrspace(1)* null
112 ret void
113 }
114
115 ; GCN-LABEL: {{^}}widen_i16_zextload_i64_constant_load:
116 ; GCN: s_load_dword [[VAL:s[0-9]+]]
117 ; GCN: s_and_b32 [[TRUNC:s[0-9]+]], [[VAL]], 0xffff{{$}}
118 ; GCN: s_addk_i32 [[TRUNC]], 0x3e7
119 ; GCN: s_or_b32 [[OR:s[0-9]+]], [[TRUNC]], 4
120 define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)* %arg) {
121 %load = load i16, i16 addrspace(4)* %arg, align 4
122 %zext = zext i16 %load to i32
123 %add = add i32 %zext, 999
124 %or = or i32 %add, 4
125 store i32 %or, i32 addrspace(1)* null
126 ret void
127 }
128
129 ; GCN-LABEL: {{^}}widen_i1_zext_to_i64_constant_load:
130 ; GCN: s_load_dword [[VAL:s[0-9]+]]
131 ; GCN: s_and_b32 [[AND:s[0-9]+]], [[VAL]], 1
132 ; GCN: s_add_u32 [[ADD:s[0-9]+]], [[AND]], 0x3e7
133 ; GCN: s_addc_u32 s{{[0-9]+}}, 0, 0
134 define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %arg) {
135 %load = load i1, i1 addrspace(4)* %arg, align 4
136 %zext = zext i1 %load to i64
137 %add = add i64 %zext, 999
138 store i64 %add, i64 addrspace(1)* null
139 ret void
140 }
141
142 ; GCN-LABEL: {{^}}widen_i16_constant32_load:
143 ; GCN: s_load_dword [[VAL:s[0-9]+]]
144 ; GCN: s_addk_i32 [[VAL]], 0x3e7
145 ; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 4
146 define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) {
147 %load = load i16, i16 addrspace(6)* %arg, align 4
148 %add = add i16 %load, 999
149 %or = or i16 %add, 4
150 store i16 %or, i16 addrspace(1)* null
151 ret void
152 }
153
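; addrspace(6) is the AMDGPU 32-bit constant address space; it qualifies for
; the same widening as addrspace(4), matching the CONSTANT_ADDRESS_32BIT case.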
154 ; GCN-LABEL: {{^}}widen_i16_global_invariant_load:
155 ; GCN: s_load_dword [[VAL:s[0-9]+]]
156 ; GCN: s_addk_i32 [[VAL]], 0x3e7
157 ; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 1
158 define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %arg) {
159 %load = load i16, i16 addrspace(1)* %arg, align 4, !invariant.load !0
160 %add = add i16 %load, 999
161 %or = or i16 %add, 1
162 store i16 %or, i16 addrspace(1)* null
163 ret void
164 }
165
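; A load from global memory marked !invariant.load can be treated like
; constant-address data, so it is widened and selected to s_load_dword just
; like the addrspace(4) cases above.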
166 declare i32 @llvm.amdgcn.workitem.id.x()
167
168 !0 = !{}