llvm.org GIT mirror llvm / a2ba13d
AMDGPU: Add pass to lower kernel arguments to loads This replaces most argument uses with loads, but for now not all. The code in SelectionDAG for calling convention lowering is actively harmful for amdgpu_kernel. It attempts to split the argument types into register legal types, which results in low quality code for arbitrary types. Since all kernel arguments are passed in memory, we just want the raw types. I've tried a couple of methods of mitigating this in SelectionDAG, but it's easier to just bypass this problem altogether. It's possible to hack around the problem in the initial lowering, but the real problem is the DAG then expects to be able to use CopyToReg/CopyFromReg for uses of the arguments outside the block. Exposing the argument loads in the IR also has the advantage that the LoadStoreVectorizer can merge them. I'm not sure what the best approach to dealing with the IR argument list is. The patch as-is just leaves the IR arguments in place, so all the existing code will still compute the same kernarg size and pointlessly lowers the arguments. Arguably the frontend should emit kernels with an empty argument list in the first place. Alternatively a dummy array could be inserted as a single argument just to reserve space. This does have some disadvantages. Local pointer kernel arguments can no longer have AssertZext placed on them as the equivalent !range metadata is not valid on pointer typed loads. This is mostly bad for SI which needs to know about the known bits in order to use the DS instruction offset, so in this case this is not done. More importantly, this skips noalias arguments since this pass does not yet convert this to the equivalent !alias.scope and !noalias metadata. Producing this metadata correctly seems to be tricky, although this logically is the same as inlining into a function which doesn't exist. 
Additionally, exposing these loads to the vectorizer may result in degraded aliasing information if a pointer load is merged with another argument load. I'm also not entirely sure this is preserving the current clover ABI, although I would greatly prefer if it would stop widening arguments and match the HSA ABI. As-is I think it is extending < 4-byte arguments to 4-bytes but doesn't align them to 4-bytes. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@335650 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 1 year, 7 months ago
130 changed file(s) with 3144 addition(s) and 1572 deletion(s). Raw diff Collapse all Expand all
7272 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
7373 extern char &AMDGPULowerIntrinsicsID;
7474
75 FunctionPass *createAMDGPULowerKernelArgumentsPass();
76 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
77 extern char &AMDGPULowerKernelArgumentsID;
78
7579 ModulePass *createAMDGPULowerKernelAttributesPass();
7680 void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
7781 extern char &AMDGPULowerKernelAttributesID;
0 //===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass replaces accesses to kernel arguments with loads from
10 /// offsets from the kernarg base pointer.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPU.h"
15 #include "AMDGPUSubtarget.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/Analysis/DivergenceAnalysis.h"
19 #include "llvm/Analysis/Loads.h"
20 #include "llvm/CodeGen/Passes.h"
21 #include "llvm/CodeGen/TargetPassConfig.h"
22 #include "llvm/IR/Attributes.h"
23 #include "llvm/IR/BasicBlock.h"
24 #include "llvm/IR/Constants.h"
25 #include "llvm/IR/DerivedTypes.h"
26 #include "llvm/IR/Function.h"
27 #include "llvm/IR/IRBuilder.h"
28 #include "llvm/IR/InstrTypes.h"
29 #include "llvm/IR/Instruction.h"
30 #include "llvm/IR/Instructions.h"
31 #include "llvm/IR/LLVMContext.h"
32 #include "llvm/IR/MDBuilder.h"
33 #include "llvm/IR/Metadata.h"
34 #include "llvm/IR/Operator.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/IR/Value.h"
37 #include "llvm/Pass.h"
38 #include "llvm/Support/Casting.h"
39
40 #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
41
42 using namespace llvm;
43
44 namespace {
45
46 class AMDGPULowerKernelArguments : public FunctionPass{
47 public:
48 static char ID;
49
50 AMDGPULowerKernelArguments() : FunctionPass(ID) {}
51
52 bool runOnFunction(Function &F) override;
53
54 void getAnalysisUsage(AnalysisUsage &AU) const override {
55 AU.addRequired();
56 AU.setPreservesAll();
57 }
58 };
59
60 } // end anonymous namespace
61
62 bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
63 CallingConv::ID CC = F.getCallingConv();
64 if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
65 return false;
66
67 auto &TPC = getAnalysis();
68
69 const TargetMachine &TM = TPC.getTM();
70 const SISubtarget &ST = TM.getSubtarget(F);
71 LLVMContext &Ctx = F.getParent()->getContext();
72 const DataLayout &DL = F.getParent()->getDataLayout();
73 BasicBlock &EntryBlock = *F.begin();
74 IRBuilder<> Builder(&*EntryBlock.begin());
75
76 SmallVector ArgTypes;
77 for (Argument &Arg : F.args()) {
78 Type *ArgTy = Arg.getType();
79 unsigned Size = DL.getTypeStoreSizeInBits(ArgTy);
80 bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
81 !ST.isAmdHsaOS();
82
83 // Clover seems to always pad i8/i16 to i32, but doesn't properly align
84 // them?
85 // Make sure the struct elements have correct size and alignment for ext
86 // args. These seem to be padded up to 4-bytes but not correctly aligned.
87 ArgTypes.push_back(
88 IsExtArg ? ArrayType::get(ArgTy, 32 / Size) : Arg.getType());
89 }
90
91 StructType *ArgStructTy = StructType::create(Ctx, ArgTypes, F.getName());
92 const StructLayout *Layout = DL.getStructLayout(ArgStructTy);
93
94 // Minimum alignment for kern segment is 16.
95 unsigned KernArgBaseAlign = std::max(16u, DL.getABITypeAlignment(ArgStructTy));
96 const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
97
98 // FIXME: Alignment is broken broken with explicit arg offset.;
99 const uint64_t TotalKernArgSize = BaseOffset +
100 ST.getKernArgSegmentSize(F, DL.getTypeAllocSize(ArgStructTy));
101
102 CallInst *KernArgSegment =
103 Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
104 F.getName() + ".kernarg.segment");
105
106 KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
107 KernArgSegment->addAttribute(AttributeList::ReturnIndex,
108 Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
109 KernArgSegment->addAttribute(AttributeList::ReturnIndex,
110 Attribute::getWithAlignment(Ctx, KernArgBaseAlign));
111
112 Value *KernArgBase = KernArgSegment;
113 if (BaseOffset != 0) {
114 KernArgBase = Builder.CreateConstInBoundsGEP1_64(KernArgBase, BaseOffset);
115 KernArgBaseAlign = MinAlign(KernArgBaseAlign, BaseOffset);
116 }
117
118 unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
119 Value *CastStruct = Builder.CreateBitCast(KernArgBase,
120 ArgStructTy->getPointerTo(AS));
121 for (Argument &Arg : F.args()) {
122 if (Arg.use_empty())
123 continue;
124
125 Type *ArgTy = Arg.getType();
126 if (PointerType *PT = dyn_cast(ArgTy)) {
127 // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
128 // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
129 // can't represent this with range metadata because it's only allowed for
130 // integer types.
131 if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
132 ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
133 continue;
134
135 // FIXME: We can replace this with equivalent alias.scope/noalias
136 // metadata, but this appears to be a lot of work.
137 if (Arg.hasNoAliasAttr())
138 continue;
139 }
140
141 VectorType *VT = dyn_cast(ArgTy);
142 bool IsV3 = VT && VT->getNumElements() == 3;
143 VectorType *V4Ty = nullptr;
144
145 unsigned Size = DL.getTypeSizeInBits(ArgTy);
146 bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
147 !ST.isAmdHsaOS();
148 int64_t EltOffset = Layout->getElementOffset(Arg.getArgNo());
149 int64_t AlignDownOffset = alignDown(EltOffset, 4);
150 int64_t OffsetDiff = EltOffset - AlignDownOffset;
151 unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
152
153 Value *ArgPtr;
154 if (Size < 32) {
155 // Since we don't have sub-dword scalar loads, avoid doing an extload by
156 // loading earlier than the argument address, and extracting the relevant
157 // bits.
158 //
159 // Additionally widen any sub-dword load to i32 even if suitably aligned,
160 // so that CSE between different argument loads works easily.
161
162 ArgPtr = Builder.CreateConstGEP1_64(KernArgBase, AlignDownOffset);
163 ArgPtr = Builder.CreateBitCast(
164 ArgPtr,
165 Builder.getInt32Ty()->getPointerTo(AS),
166 Arg.getName() + ".kernarg.offset.align.down");
167 } else {
168 ArgPtr = Builder.CreateStructGEP(CastStruct, Arg.getArgNo(),
169 Arg.getName() + ".kernarg.offset");
170 }
171
172 assert((!IsExtArg || !IsV3) && "incompatible situation");
173
174
175 if (IsV3 && Size >= 32) {
176 V4Ty = VectorType::get(VT->getVectorElementType(), 4);
177 // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
178 ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
179 }
180
181 LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
182 Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
183
184 MDBuilder MDB(Ctx);
185
186 if (isa(ArgTy)) {
187 if (Arg.hasNonNullAttr())
188 Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
189
190 uint64_t DerefBytes = Arg.getDereferenceableBytes();
191 if (DerefBytes != 0) {
192 Load->setMetadata(
193 LLVMContext::MD_dereferenceable,
194 MDNode::get(Ctx,
195 MDB.createConstant(
196 ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
197 }
198
199 uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
200 if (DerefOrNullBytes != 0) {
201 Load->setMetadata(
202 LLVMContext::MD_dereferenceable_or_null,
203 MDNode::get(Ctx,
204 MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
205 DerefOrNullBytes))));
206 }
207
208 unsigned ParamAlign = Arg.getParamAlignment();
209 if (ParamAlign != 0) {
210 Load->setMetadata(
211 LLVMContext::MD_align,
212 MDNode::get(Ctx,
213 MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
214 ParamAlign))));
215 }
216 }
217
218 // TODO: Convert noalias arg to !noalias
219
220 if (Size < 32) {
221 if (IsExtArg && OffsetDiff == 0) {
222 Type *I32Ty = Builder.getInt32Ty();
223 bool IsSext = Arg.hasSExtAttr();
224 Metadata *LowAndHigh[] = {
225 ConstantAsMetadata::get(
226 ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)),
227 ConstantAsMetadata::get(
228 ConstantInt::get(I32Ty,
229 IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1))
230 };
231
232 Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh));
233 }
234
235 Value *ExtractBits = OffsetDiff == 0 ?
236 Load : Builder.CreateLShr(Load, OffsetDiff * 8);
237
238 IntegerType *ArgIntTy = Builder.getIntNTy(Size);
239 Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
240 Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
241 Arg.getName() + ".load");
242 Arg.replaceAllUsesWith(NewVal);
243 } else if (IsV3) {
244 Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
245 {0, 1, 2},
246 Arg.getName() + ".load");
247 Arg.replaceAllUsesWith(Shuf);
248 } else {
249 Load->setName(Arg.getName() + ".load");
250 Arg.replaceAllUsesWith(Load);
251 }
252 }
253
254 return true;
255 }
256
257 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
258 "AMDGPU Lower Kernel Arguments", false, false)
259 INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
260 false, false)
261
262 char AMDGPULowerKernelArguments::ID = 0;
263
// Factory entry point used by the AMDGPU pass pipeline (addCodeGenPrepare)
// to instantiate the kernel-argument lowering pass.
FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  return new AMDGPULowerKernelArguments();
}
129129 cl::init(true),
130130 cl::Hidden);
131131
132 static cl::opt EnableLowerKernelArguments(
133 "amdgpu-ir-lower-kernel-arguments",
134 cl::desc("Lower kernel argument loads in IR pass"),
135 cl::init(true),
136 cl::Hidden);
137
132138 extern "C" void LLVMInitializeAMDGPUTarget() {
133139 // Register the target
134140 RegisterTargetMachine X(getTheAMDGPUTarget());
154160 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
155161 initializeAMDGPUAnnotateUniformValuesPass(*PR);
156162 initializeAMDGPUArgumentUsageInfoPass(*PR);
163 initializeAMDGPULowerKernelArgumentsPass(*PR);
157164 initializeAMDGPULowerKernelAttributesPass(*PR);
158165 initializeAMDGPULowerIntrinsicsPass(*PR);
159166 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
668675 }
669676
670677 void AMDGPUPassConfig::addCodeGenPrepare() {
678 if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
679 EnableLowerKernelArguments)
680 addPass(createAMDGPULowerKernelArgumentsPass());
681
671682 TargetPassConfig::addCodeGenPrepare();
672683
673684 if (EnableLoadStoreVectorizer)
3939 AMDGPULibCalls.cpp
4040 AMDGPULibFunc.cpp
4141 AMDGPULowerIntrinsics.cpp
42 AMDGPULowerKernelArguments.cpp
4243 AMDGPULowerKernelAttributes.cpp
4344 AMDGPUMachineCFGStructurizer.cpp
4445 AMDGPUMachineFunction.cpp
88 ; GCN-LABEL: {{^}}smrd0:
99 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
1010 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
11 define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
11 define amdgpu_kernel void @smrd0(i32 addrspace(4)* %ptr) {
1212 entry:
1313 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
1414 %1 = load i32, i32 addrspace(4)* %0
15 store i32 %1, i32 addrspace(1)* %out
15 store i32 %1, i32 addrspace(1)* undef
1616 ret void
1717 }
1818
2020 ; GCN-LABEL: {{^}}smrd1:
2121 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
2222 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
23 define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
23 define amdgpu_kernel void @smrd1(i32 addrspace(4)* %ptr) {
2424 entry:
2525 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255
2626 %1 = load i32, i32 addrspace(4)* %0
27 store i32 %1, i32 addrspace(1)* %out
27 store i32 %1, i32 addrspace(1)* undef
2828 ret void
2929 }
3030
3535 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
3636 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
3737 ; GCN: s_endpgm
38 define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
38 define amdgpu_kernel void @smrd2(i32 addrspace(4)* %ptr) {
3939 entry:
4040 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256
4141 %1 = load i32, i32 addrspace(4)* %0
42 store i32 %1, i32 addrspace(1)* %out
42 store i32 %1, i32 addrspace(1)* undef
4343 ret void
4444 }
4545
5050 ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
5151 ; TODO: Add VI checks
5252 ; XGCN: s_endpgm
53 define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
53 define amdgpu_kernel void @smrd3(i32 addrspace(4)* %ptr) {
5454 entry:
5555 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32
5656 %1 = load i32, i32 addrspace(4)* %0
57 store i32 %1, i32 addrspace(1)* %out
57 store i32 %1, i32 addrspace(1)* undef
5858 ret void
5959 }
6060
6464 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
6565 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
6666 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
67 define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
67 define amdgpu_kernel void @smrd4(i32 addrspace(4)* %ptr) {
6868 entry:
6969 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143
7070 %1 = load i32, i32 addrspace(4)* %0
71 store i32 %1, i32 addrspace(1)* %out
71 store i32 %1, i32 addrspace(1)* undef
7272 ret void
7373 }
7474
7878 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
7979 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
8080 ; GCN: s_endpgm
81 define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
81 define amdgpu_kernel void @smrd5(i32 addrspace(4)* %ptr) {
8282 entry:
8383 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144
8484 %1 = load i32, i32 addrspace(4)* %0
85 store i32 %1, i32 addrspace(1)* %out
85 store i32 %1, i32 addrspace(1)* undef
8686 ret void
8787 }
8888
7575 ; SI-NOT: addc
7676 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
7777 ; SI: buffer_store_dword [[VRESULT]],
78 define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
78 define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i32, i64 %a, i32, i64 %b) {
7979 %add = add i64 %b, %a
8080 %trunc = trunc i64 %add to i32
8181 store i32 %trunc, i32 addrspace(1)* %out, align 8
22
33 ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
44 ; TRAP-HANDLER-ENABLE: NumSgprs: 60
5 ; TRAP-HANDLER-DISABLE: NumSgprs: 76
5 ; TRAP-HANDLER-DISABLE: NumSgprs: 78
66 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
77 i32 addrspace(1)* %out0, i32 %in0,
88 i32 addrspace(1)* %out1, i32 %in1,
216216 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
217217 ; SI-NOT: and
218218 ; SI: buffer_store_dwordx2
219 define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
219 define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i32, i64 %a) {
220220 %and = and i64 %a, 1234567
221221 store i64 %and, i64 addrspace(1)* %out, align 8
222222 ret void
234234 ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
235235 ; SI-NOT: and
236236 ; SI: buffer_store_dwordx2
237 define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
237 define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
238238 %shl.a = shl i64 %a, 1
239239 %shl.b = shl i64 %b, 1
240240 %and0 = and i64 %shl.a, 62
380380 ; SI-NOT: and
381381 ; SI: s_add_u32
382382 ; SI-NEXT: s_addc_u32
383 define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
383 define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i32, i64 %b) {
384384 %shl = shl i64 %a, 1
385385 %and = and i64 %shl, 64
386386 %add = add i64 %and, %b
1111 ; CIVI: s_load_dword [[LHS:s[0-9]+]]
1212 ; CIVI: s_load_dword [[RHS:s[0-9]+]]
1313
14 ; VI: s_ashr_i32
15 ; VI: s_ashr_i32
16 ; VI: s_sext_i32_i16
17 ; VI: s_sext_i32_i16
18 ; VI: s_ashr_i32
19 ; VI: s_ashr_i32
20 ; VI: s_lshl_b32
21 ; VI: s_and_b32
22 ; VI: s_or_b32
14 ; CIVI-DAG: s_ashr_i32
15 ; CIVI-DAG: s_ashr_i32
16 ; CIVI-DAG: s_sext_i32_i16
17 ; CIVI-DAG: s_sext_i32_i16
18 ; CIVI-DAG: s_ashr_i32
19 ; CIVI-DAG: s_ashr_i32
20 ; CIVI-DAG: s_lshl_b32
21 ; CIVI: s_and_b32
22 ; CIVI: s_or_b32
2323
24 ; CI: s_ashr_i32
25 ; CI: s_and_b32
26 ; CI: s_lshr_b32
27 ; CI: s_sext_i32_i16
28 ; CI: s_ashr_i32
29 ; CI: s_ashr_i32
30 ; CI: s_lshl_b32
31 ; CI: s_and_b32
32 define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
24 define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x i16> %lhs, i32, <2 x i16> %rhs) #0 {
3325 %result = ashr <2 x i16> %lhs, %rhs
3426 store <2 x i16> %result, <2 x i16> addrspace(1)* %out
3527 ret void
66 ; GFX9-NOT: m0
77 ; SICIVI-DAG: s_mov_b32 m0
88
9 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
10 ; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
11 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
12 ; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
9 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
10 ; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
11 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
12 ; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
1313 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
1414 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
1515 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
1616 ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
1717 ; GCN: s_endpgm
18 define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
18 define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind {
1919 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
2020 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
2121 %result = extractvalue { i32, i1 } %pair, 0
6969
7070
7171 ; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
72 ; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
72 ; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12
7373 ; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
74 ; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
74 ; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48
7575 ; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
7676 ; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
7777 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
7878 ; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
7979 ; GCN: s_endpgm
80 define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
80 define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind {
8181 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
8282 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
8383 %result = extractvalue { i32, i1 } %pair, 0
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
1
2 ; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0
3
4 ; ALL-LABEL: {{^}}max_9_sgprs:
5 ; ALL: SGPRBlocks: 1
6 ; ALL: NumSGPRsForWavesPerEU: 9
7 define amdgpu_kernel void @max_9_sgprs() #0 {
8 %one = load volatile i32, i32 addrspace(4)* undef
9 %two = load volatile i32, i32 addrspace(4)* undef
10 %three = load volatile i32, i32 addrspace(4)* undef
11 %four = load volatile i32, i32 addrspace(4)* undef
12 %five = load volatile i32, i32 addrspace(4)* undef
13 %six = load volatile i32, i32 addrspace(4)* undef
14 %seven = load volatile i32, i32 addrspace(4)* undef
15 %eight = load volatile i32, i32 addrspace(4)* undef
16 %nine = load volatile i32, i32 addrspace(4)* undef
17 %ten = load volatile i32, i32 addrspace(4)* undef
18 call void asm sideeffect "", "s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight)
19 store volatile i32 %one, i32 addrspace(1)* undef
20 store volatile i32 %two, i32 addrspace(1)* undef
21 store volatile i32 %three, i32 addrspace(1)* undef
22 store volatile i32 %four, i32 addrspace(1)* undef
23 store volatile i32 %five, i32 addrspace(1)* undef
24 store volatile i32 %six, i32 addrspace(1)* undef
25 store volatile i32 %seven, i32 addrspace(1)* undef
26 store volatile i32 %eight, i32 addrspace(1)* undef
27 store volatile i32 %nine, i32 addrspace(1)* undef
28 store volatile i32 %ten, i32 addrspace(1)* undef
29 ret void
30 }
31
32 attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
21
32 ; If spilling to smem, additional registers are used for the resource
43 ; descriptor.
4
5 ; FIXME: Vectorization can increase required SGPR count beyond limit.
6 ; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0
57
68 ; ALL-LABEL: {{^}}max_9_sgprs:
79
810 ; ALL: SGPRBlocks: 1
911 ; ALL: NumSGPRsForWavesPerEU: 9
10 define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
11
12 i32 addrspace(1)* %out2,
13 i32 addrspace(1)* %out3,
14 i32 addrspace(1)* %out4,
15 i32 addrspace(1)* %out5,
16 i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 {
17 store i32 %one, i32 addrspace(1)* %out1
18 store i32 %two, i32 addrspace(1)* %out2
19 store i32 %three, i32 addrspace(1)* %out3
20 store i32 %four, i32 addrspace(1)* %out4
21 store i32 %five, i32 addrspace(1)* %out5
12 define amdgpu_kernel void @max_9_sgprs() #0 {
13 %one = load volatile i32, i32 addrspace(4)* undef
14 %two = load volatile i32, i32 addrspace(4)* undef
15 %three = load volatile i32, i32 addrspace(4)* undef
16 %four = load volatile i32, i32 addrspace(4)* undef
17 %five = load volatile i32, i32 addrspace(4)* undef
18 %six = load volatile i32, i32 addrspace(4)* undef
19 %seven = load volatile i32, i32 addrspace(4)* undef
20 %eight = load volatile i32, i32 addrspace(4)* undef
21 %nine = load volatile i32, i32 addrspace(4)* undef
22 %ten = load volatile i32, i32 addrspace(4)* undef
23 call void asm sideeffect "", "s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine)
24 store volatile i32 %one, i32 addrspace(1)* undef
25 store volatile i32 %two, i32 addrspace(1)* undef
26 store volatile i32 %three, i32 addrspace(1)* undef
27 store volatile i32 %four, i32 addrspace(1)* undef
28 store volatile i32 %five, i32 addrspace(1)* undef
29 store volatile i32 %six, i32 addrspace(1)* undef
30 store volatile i32 %seven, i32 addrspace(1)* undef
31 store volatile i32 %eight, i32 addrspace(1)* undef
32 store volatile i32 %nine, i32 addrspace(1)* undef
33 store volatile i32 %ten, i32 addrspace(1)* undef
2234 ret void
2335 }
2436
2828
2929 ; GCN-LABEL: {{^}}test_brcc_i1:
3030 ; GCN: s_load_dword [[VAL:s[0-9]+]]
31 ; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]]
31 ; GCNNOOPT: s_mov_b32 [[ONE:s[0-9]+]], 1{{$}}
32 ; GCNNOOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], [[ONE]]
3233 ; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1
3334 ; GCN: s_cmp_eq_u32
3435 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}v_ubfe_sub_i32:
44 ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
4747 }
4848
4949 ; GCN-LABEL: {{^}}s_ubfe_sub_i32:
50 ; GCN: s_load_dword [[SRC:s[0-9]+]]
51 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
52 ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
53 ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
50 ; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
51 ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]]
52 ; GCN: v_bfe_u32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]]
5453 define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
5554 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
5655 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
6261 }
6362
6463 ; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32:
65 ; GCN: s_load_dword [[SRC:s[0-9]+]]
66 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
67 ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
68 ; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
69 ; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
64 ; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
65 ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
66 ; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]]
67 ; GCN: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
7068 define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
7169 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
7270 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
125123 }
126124
127125 ; GCN-LABEL: {{^}}s_sbfe_sub_i32:
128 ; GCN: s_load_dword [[SRC:s[0-9]+]]
129 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
130 ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
131 ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
126 ; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
127 ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]]
128 ; GCN: v_bfe_i32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]]
132129 define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
133130 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
134131 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
140137 }
141138
142139 ; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32:
143 ; GCN: s_load_dword [[SRC:s[0-9]+]]
144 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
145 ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
146 ; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
147 ; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
140 ; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
141 ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
142 ; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]]
143 ; GCN: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
148144 define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
149145 %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
150146 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
None ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s
1 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s
2 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck -check-prefixes=R600,FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s
33
44 ; BFI_INT Definition pattern from ISA docs
55 ; (y & x) | (z & ~x)
118118 ; FUNC-LABEL: {{^}}s_bitselect_i64_pat_0:
119119 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
120120 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
121 ; GCN: v_bfi_b32
122121 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
123 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
124 ; GCN: v_bfi_b32
122 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
123 ; GCN-DAG: v_bfi_b32
124 ; GCN-DAG: v_bfi_b32
125125 define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
126126 %and0 = and i64 %a, %b
127127 %not.a = xor i64 %a, -1
135135 ; FUNC-LABEL: {{^}}s_bitselect_i64_pat_1:
136136 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
137137 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
138 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
139 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
138140 ; GCN-DAG: v_bfi_b32
139 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
140 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
141 ; GCN: v_bfi_b32
141 ; GCN-DAG: v_bfi_b32
142142 define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
143143 %xor.0 = xor i64 %a, %mask
144144 %and = and i64 %xor.0, %b
154154 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
155155 ; GCN-DAG: v_bfi_b32
156156 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
157 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
158 ; GCN: v_bfi_b32
157 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
158 ; GCN-DAG: v_bfi_b32
159159 define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
160160 %xor.0 = xor i64 %a, %mask
161161 %and = and i64 %xor.0, %b
1010 ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
1111 ; GCN: s_cbranch_vccnz
1212
13 ; GCN: one{{$}}
14 ; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
15 ; GCN: buffer_store_short
16 ; GCN: s_endpgm
13 ; SI: one{{$}}
14 ; SI: v_cvt_f16_f32_e32 v[[CVT:[0-9]+]], v[[A_F32]]
1715
18 ; GCN: two{{$}}
19 ; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
20 ; GCN: buffer_store_short v[[B_F16]]
21 ; GCN: s_endpgm
16 ; SI: two{{$}}
17 ; SI: v_cvt_f16_f32_e32 v[[CVT]], v[[B_F32]]
18
19 ; SI: one{{$}}
20 ; SI: buffer_store_short v[[CVT]]
21 ; SI: s_endpgm
22
23
24
25 ; VI: one{{$}}
26 ; VI: buffer_store_short v[[A_F16]]
27 ; VI: s_endpgm
28
29 ; VI: two{{$}}
30 ; VI: buffer_store_short v[[B_F16]]
31 ; VI: s_endpgm
2232 define amdgpu_kernel void @br_cc_f16(
2333 half addrspace(1)* %r,
2434 half addrspace(1)* %a,
2535 half addrspace(1)* %b) {
2636 entry:
27 %a.val = load half, half addrspace(1)* %a
28 %b.val = load half, half addrspace(1)* %b
37 %a.val = load volatile half, half addrspace(1)* %a
38 %b.val = load volatile half, half addrspace(1)* %b
2939 %fcmp = fcmp olt half %a.val, %b.val
3040 br i1 %fcmp, label %one, label %two
3141
489489
490490 ; GCN-LABEL: {{^}}long_branch_hang:
491491 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
492 ; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
492 ; GCN: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
493493 ; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
494494 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
495495
88 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
99 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
1010 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
11 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
11 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
1212 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
1313 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
1414 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
2222 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
2323 ; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
2424 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
25 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
25 ; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
2626 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
2727 ; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
2828 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
122122 }
123123
124124 ; FUNC-LABEL: {{^}}s_ctlz_i64:
125 ; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
125 ; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
126126 ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
127127 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
128128 ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
132132 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
133133 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
134134 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
135 define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
135 define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
136136 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
137137 store i64 %ctlz, i64 addrspace(1)* %out
138138 ret void
9797 }
9898
9999 ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
100 ; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
100 ; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
101101 ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
102102 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
103103 ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
107107 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
108108 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
109109 ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
110 define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
110 define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
111111 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
112112 store i64 %ctlz, i64 addrspace(1)* %out
113113 ret void
304304 ; but there are some cases when the should be allowed.
305305
306306 ; FUNC-LABEL: {{^}}ctpop_i32_in_br:
307 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
308 ; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
307 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x16
308 ; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x58
309309 ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
310310 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
311311 ; GCN: buffer_store_dword [[RESULT]],
312312 ; GCN: s_endpgm
313313 ; EG: BCNT_INT
314 define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
314 define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, [8 x i32], i32 %cond) {
315315 entry:
316316 %tmp0 = icmp eq i32 %cond, 0
317317 br i1 %tmp0, label %if, label %else
307307 ; FUNC-LABEL: {{^}}ctpop_i16_in_br:
308308 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
309309 ; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
310 ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
310
311 ; GCN: s_and_b32 [[CTPOP_ARG:s[0-9]+]], [[VAL]], 0xffff
312 ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[CTPOP_ARG]]
311313 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
312314 ; GCN: buffer_store_short [[RESULT]],
313315 ; GCN: s_endpgm
1212 declare i128 @llvm.ctpop.i128(i128) nounwind readnone
1313
1414 ; FUNC-LABEL: {{^}}s_ctpop_i64:
15 ; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
16 ; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
15 ; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
16 ; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
1717 ; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
1818 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
1919 ; GCN: buffer_store_dword [[VRESULT]],
2020 ; GCN: s_endpgm
21 define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
21 define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
2222 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
2323 %truncctpop = trunc i64 %ctpop to i32
2424 store i32 %truncctpop, i32 addrspace(1)* %out, align 4
5757 }
5858
5959 ; GCN-LABEL: {{^}}extract_vector_elt_v3f16:
60 ; SI: s_load_dword s
61 ; SI: s_load_dword s
62
63 ; GFX89: s_load_dwordx2
64 ; GFX89: s_load_dwordx2
60 ; GCN: s_load_dwordx2
61 ; GCN: s_load_dwordx2
6562
6663 ; GCN: buffer_store_short
6764 ; GCN: buffer_store_short
7774 ; FIXME: Why sometimes vector shift?
7875 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
7976 ; SI: s_load_dword s
80 ; SI: s_load_dword s
81 ; SI: s_load_dword s
77 ; SI: s_load_dwordx2 s
78 ; SI: s_load_dwordx2 s
8279
8380 ; GFX89: s_load_dwordx2 s
8481 ; GFX89: s_load_dwordx2 s
8683
8784
8885 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
89 ; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
90
91 ; SI: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
86 ; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
9287
9388 ; GCN: {{buffer|global}}_store_short
9489 define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
2626 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
2727 ; GCN: buffer_store_short [[VELT1]]
2828 ; GCN: ScratchSize: 0
29 define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 {
29 define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
3030 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
3131 %elt = extractelement <2 x i16> %vec, i32 %idx
3232 store i16 %elt, i16 addrspace(1)* %out, align 2
5757 }
5858
5959 ; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
60 ; SI: s_load_dword s
61 ; SI: s_load_dwordx2 s
62 ; SI: s_load_dword s
63
64 ; GFX89: s_load_dwordx2
65 ; GFX89: s_load_dwordx2
60 ; GCN: s_load_dwordx2
61 ; GCN: s_load_dwordx2
6662
6763 ; GCN-NOT: {{buffer|flat|global}}_load
6864
7874 }
7975
8076 ; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
81 ; SI: s_load_dword s
82 ; SI: s_load_dword s
77 ; SI: s_load_dwordx2
8378 ; SI: buffer_store_short
8479 ; SI: buffer_store_short
8580
9994
10095 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
10196 ; SI: s_load_dword s
102 ; SI: s_load_dword s
103 ; SI: s_load_dword s
97 ; SI: s_load_dwordx2 s
98 ; SI: s_load_dwordx2 s
10499
105 ; GFX89-DAG: s_load_dwordx2
106 ; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c
107 ; GFX89-DAG: s_load_dword s
100 ; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x24
101 ; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x4c
102 ; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54
108103
109104 ; GCN-NOT: {{buffer|flat|global}}
110105
112107 ; SICI: buffer_store_short
113108 ; SICI: buffer_store_short
114109
115 ; SICI: buffer_load_ushort
116 ; SICI: buffer_store_short
117
118110 ; GFX9-NOT: s_pack_ll_b32_b16
119111 ; GFX9-NOT: s_pack_lh_b32_b16
120112
121113 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
122 ; GFX89: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LOAD0]]:[[LOAD1]]{{\]}}, s{{[0-9]+}}
123
114 ; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s
124115 ; GCN: {{buffer|global}}_store_short
125 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
116 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 {
126117 %p0 = extractelement <3 x i16> %foo, i32 %idx
127118 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
128119 store i16 %p0, i16 addrspace(1)* %out
None ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
1 ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
0 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
22
33 ; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
44 ; GCN: s_load_dword [[LOAD:s[0-9]+]]
1313 ; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
1414 ; GCN: s_load_dword s
1515 ; GCN-NOT: {{flat|buffer|global}}
16 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
16 ; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
17 ; VI: v_lshrrev_b16_e64 v{{[0-9]+}}, 8, s{{[0-9]+}}
1718 ; GCN-NOT: {{flat|buffer|global}}
1819 ; GCN: buffer_store_byte
1920 ; GCN: buffer_store_byte
2122 %p0 = extractelement <2 x i8> %foo, i32 0
2223 %p1 = extractelement <2 x i8> %foo, i32 1
2324 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
24 store i8 %p1, i8 addrspace(1)* %out
25 store i8 %p0, i8 addrspace(1)* %out1
25 store volatile i8 %p1, i8 addrspace(1)* %out
26 store volatile i8 %p0, i8 addrspace(1)* %out1
2627 ret void
2728 }
2829
3738 %p0 = extractelement <3 x i8> %foo, i32 0
3839 %p1 = extractelement <3 x i8> %foo, i32 2
3940 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
40 store i8 %p1, i8 addrspace(1)* %out
41 store i8 %p0, i8 addrspace(1)* %out1
41 store volatile i8 %p1, i8 addrspace(1)* %out
42 store volatile i8 %p0, i8 addrspace(1)* %out1
4243 ret void
4344 }
4445
5354 %p0 = extractelement <4 x i8> %foo, i32 0
5455 %p1 = extractelement <4 x i8> %foo, i32 2
5556 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
56 store i8 %p1, i8 addrspace(1)* %out
57 store i8 %p0, i8 addrspace(1)* %out1
57 store volatile i8 %p1, i8 addrspace(1)* %out
58 store volatile i8 %p0, i8 addrspace(1)* %out1
5859 ret void
5960 }
6061
6162 ; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
63 ; GCN-NOT: {{s|flat|buffer|global}}_load
6264 ; GCN: s_load_dword [[VAL:s[0-9]+]]
63 ; GCN-NOT: {{flat|buffer|global}}
65 ; GCN-NOT: {{s|flat|buffer|global}}_load
6466 ; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16
65 ; GCN-NOT: {{flat|buffer|global}}
66 ; GCN: buffer_store_byte
67 ; GCN: buffer_store_byte
68 define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
67 ; GCN-NOT: {{s|flat|buffer|global}}_load
68 ; GCN: buffer_store_byte
69 ; GCN: buffer_store_byte
70 define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
6971 %p0 = extractelement <8 x i8> %foo, i32 0
7072 %p1 = extractelement <8 x i8> %foo, i32 2
71 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
72 store i8 %p1, i8 addrspace(1)* %out
73 store i8 %p0, i8 addrspace(1)* %out1
73 store volatile i8 %p1, i8 addrspace(1)* null
74 store volatile i8 %p0, i8 addrspace(1)* null
7475 ret void
7576 }
7677
8687 %p0 = extractelement <16 x i8> %foo, i32 0
8788 %p1 = extractelement <16 x i8> %foo, i32 2
8889 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
89 store i8 %p1, i8 addrspace(1)* %out
90 store i8 %p0, i8 addrspace(1)* %out1
90 store volatile i8 %p1, i8 addrspace(1)* %out
91 store volatile i8 %p0, i8 addrspace(1)* %out1
9192 ret void
9293 }
9394
9495 ; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
95 ; GCN: s_load_dword [[LOAD0:s[0-9]+]]
96 ; GCN-NOT: {{flat|buffer|global}}
97 ; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
98 ; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
96 ; GCN-NOT: {{s|flat|buffer|global}}_load
97 ; GCN: s_load_dword [[VAL:s[0-9]+]]
98 ; GCN-NOT: {{s|flat|buffer|global}}_load
99 ; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16
100 ; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}}
99101 ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
100102 ; GCN: buffer_store_byte [[V_ELT2]]
101103 ; GCN: buffer_store_byte [[V_LOAD0]]
102 define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
104 define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
103105 %p0 = extractelement <32 x i8> %foo, i32 0
104106 %p1 = extractelement <32 x i8> %foo, i32 2
105 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
106 store i8 %p1, i8 addrspace(1)* %out
107 store i8 %p0, i8 addrspace(1)* %out1
107 store volatile i8 %p1, i8 addrspace(1)* null
108 store volatile i8 %p0, i8 addrspace(1)* null
108109 ret void
109110 }
110111
120121 %p0 = extractelement <64 x i8> %foo, i32 0
121122 %p1 = extractelement <64 x i8> %foo, i32 2
122123 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
123 store i8 %p1, i8 addrspace(1)* %out
124 store i8 %p0, i8 addrspace(1)* %out1
124 store volatile i8 %p1, i8 addrspace(1)* %out
125 store volatile i8 %p0, i8 addrspace(1)* %out1
125126 ret void
126127 }
127128
131132 ; isTypeDesirableForOp in SimplifyDemandedBits
132133
133134 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
134 ; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
135 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
135 ; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
136 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
136137 ; VI-NOT: {{flat|buffer|global}}
137 ; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
138 ; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
139 ; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
140 ; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[ELT0]], [[ELT2]]
138 ; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
141139 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
142 ; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[BUILD_VEC]]
143 ; VI: buffer_store_byte [[EXTRACT]]
144 define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 {
140 ; VI: v_lshrrev_b16_e32 [[ELT:v[0-9]+]], [[SCALED_IDX]], [[V_LOAD]]
141 ; VI: buffer_store_byte [[ELT]]
142 define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
145143 %elt = extractelement <2 x i8> %foo, i32 %idx
146 store i8 %elt, i8 addrspace(1)* %out
144 store volatile i8 %elt, i8 addrspace(1)* %out
147145 ret void
148146 }
149147
150148 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
151 ; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
152 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
149 ; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
150 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
153151 ; VI-NOT: {{flat|buffer|global}}
154 ; VI: s_lshr_b32 [[ELT12:s[0-9]+]], [[LOAD]], 8
155 ; VI: v_lshlrev_b16_e64 [[ELT1:v[0-9]+]], 8, [[ELT12]]
156 ; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
157 ; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[ELT0]], [[ELT1]]
158152 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
159 ; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]]
160 ; VI: buffer_store_byte [[EXTRACT]]
161 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
153 ; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]]
154 ; VI: v_mov_b32_e32 [[V_ELT:v[0-9]+]], [[ELT]]
155 ; VI: buffer_store_byte [[V_ELT]]
156 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
162157 %p0 = extractelement <3 x i8> %foo, i32 %idx
163158 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
164 store i8 %p0, i8 addrspace(1)* %out
159 store volatile i8 %p0, i8 addrspace(1)* %out
165160 ret void
166161 }
167162
168163 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
169 ; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
164 ; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30
170165 ; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
171166
172167 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
174169
175170 ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
176171 ; VI: buffer_store_byte [[V_EXTRACT]]
177 define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
172 define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
178173 %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr
179174 %p0 = extractelement <4 x i8> %vec, i32 %idx
180175 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
181 store i8 %p0, i8 addrspace(1)* %out
176 store volatile i8 %p0, i8 addrspace(1)* %out
182177 ret void
183178 }
184179
185180 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
186 ; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
181 ; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10
187182 ; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
188183
189184 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
194189 %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr
195190 %p0 = extractelement <8 x i8> %vec, i32 %idx
196191 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
197 store i8 %p0, i8 addrspace(1)* %out
192 store volatile i8 %p0, i8 addrspace(1)* %out
198193 ret void
199194 }
200195
3838 }
3939
4040 ; GCN-LABEL: {{^}}s_fabs_v4f16:
41 ; CI: s_load_dword s[[LO:[0-9]+]]
42 ; CI: s_load_dword s[[HI:[0-9]+]]
41 ; CI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2
4342 ; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
43
4444 ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
4545 ; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]]
4646 ; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]]
5353
5454 ; GCN-LABEL: {{^}}fabs_fold_f16:
5555 ; GCN: s_load_dword [[IN0:s[0-9]+]]
56 ; GCN: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16
56 ; GCN-DAG: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16
5757
5858 ; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]|
5959 ; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]]
6161 ; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
6262 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
6363
64 ; GFX89-NOT: and
6465 ; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]]
6566 ; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]]
6667 ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
5252 }
5353
5454 ; SI-LABEL: {{^}}fabs_fold_f64:
55 ; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
55 ; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
5656 ; SI-NOT: and
5757 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
5858 ; SI: s_endpgm
59 define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
59 define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) {
6060 %fabs = call double @llvm.fabs.f64(double %in0)
6161 %fmul = fmul double %fabs, %in1
6262 store double %fmul, double addrspace(1)* %out
6464 }
6565
6666 ; SI-LABEL: {{^}}fabs_fn_fold_f64:
67 ; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
67 ; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
6868 ; SI-NOT: and
6969 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
7070 ; SI: s_endpgm
71 define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
71 define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) {
7272 %fabs = call double @fabs(double %in0)
7373 %fmul = fmul double %fabs, %in1
7474 store double %fmul, double addrspace(1)* %out
6969 }
7070
7171 ; GCN-LABEL: {{^}}fabs_fn_fold:
72 ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
73 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
72 ; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
73 ; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
7474 ; GCN-NOT: and
75 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
75 ; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]]
76 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]]
7677 define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
7778 %fabs = call float @fabs(float %in0)
7879 %fmul = fmul float %fabs, %in1
8182 }
8283
8384 ; FUNC-LABEL: {{^}}fabs_fold:
84 ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
85 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
85 ; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
86 ; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
8687 ; GCN-NOT: and
87 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
88 ; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]]
89 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]]
8890 define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
8991 %fabs = call float @llvm.fabs.f32(float %in0)
9092 %fmul = fmul float %fabs, %in1
1515 half addrspace(1)* %a,
1616 half addrspace(1)* %b) {
1717 entry:
18 %a.val = load half, half addrspace(1)* %a
19 %b.val = load half, half addrspace(1)* %b
18 %a.val = load volatile half, half addrspace(1)* %a
19 %b.val = load volatile half, half addrspace(1)* %b
2020 %r.val = fadd half %a.val, %b.val
2121 store half %r.val, half addrspace(1)* %r
2222 ret void
6464 ; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
6565 ; VI: flat_load_dword v[[A_V2_F16:[0-9]+]]
6666
67 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
68 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
69 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
70 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
67 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
68 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
69 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
70 ; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
7171
7272 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
7373 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
101101
102102 ; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
103103 ; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
104 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
105 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
106 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
107 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
108 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
109 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
110 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
104 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
105 ; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
106 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
107 ; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
108 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
109 ; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
110 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
111111 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
112112 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
113113
132132
133133 ; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
134134 ; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
135 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
135 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
136136 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
137 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
138 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
139 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
140 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
141 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
137 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
138 ; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
139 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
140 ; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
141 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
142142 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
143143 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
144144
1515 half addrspace(1)* %a,
1616 half addrspace(1)* %b) {
1717 entry:
18 %a.val = load half, half addrspace(1)* %a
19 %b.val = load half, half addrspace(1)* %b
18 %a.val = load volatile half, half addrspace(1)* %a
19 %b.val = load volatile half, half addrspace(1)* %b
2020 %r.val = fcmp olt half %a.val, %b.val
2121 %r.val.sext = sext i1 %r.val to i32
2222 store i32 %r.val.sext, i32 addrspace(1)* %r
4141 half addrspace(1)* %a,
4242 half addrspace(1)* %b) {
4343 entry:
44 %a.val = load half, half addrspace(1)* %a
45 %b.val = load half, half addrspace(1)* %b
44 %a.val = load volatile half, half addrspace(1)* %a
45 %b.val = load volatile half, half addrspace(1)* %b
4646 %a.abs = call half @llvm.fabs.f16(half %a.val)
4747 %b.abs = call half @llvm.fabs.f16(half %b.val)
4848 %r.val = fcmp olt half %a.abs, %b.abs
6666 half addrspace(1)* %a,
6767 half addrspace(1)* %b) {
6868 entry:
69 %a.val = load half, half addrspace(1)* %a
70 %b.val = load half, half addrspace(1)* %b
69 %a.val = load volatile half, half addrspace(1)* %a
70 %b.val = load volatile half, half addrspace(1)* %b
7171 %r.val = fcmp oeq half %a.val, %b.val
7272 %r.val.sext = sext i1 %r.val to i32
7373 store i32 %r.val.sext, i32 addrspace(1)* %r
8989 half addrspace(1)* %a,
9090 half addrspace(1)* %b) {
9191 entry:
92 %a.val = load half, half addrspace(1)* %a
93 %b.val = load half, half addrspace(1)* %b
92 %a.val = load volatile half, half addrspace(1)* %a
93 %b.val = load volatile half, half addrspace(1)* %b
9494 %r.val = fcmp ole half %a.val, %b.val
9595 %r.val.sext = sext i1 %r.val to i32
9696 store i32 %r.val.sext, i32 addrspace(1)* %r
112112 half addrspace(1)* %a,
113113 half addrspace(1)* %b) {
114114 entry:
115 %a.val = load half, half addrspace(1)* %a
116 %b.val = load half, half addrspace(1)* %b
115 %a.val = load volatile half, half addrspace(1)* %a
116 %b.val = load volatile half, half addrspace(1)* %b
117117 %r.val = fcmp ogt half %a.val, %b.val
118118 %r.val.sext = sext i1 %r.val to i32
119119 store i32 %r.val.sext, i32 addrspace(1)* %r
135135 half addrspace(1)* %a,
136136 half addrspace(1)* %b) {
137137 entry:
138 %a.val = load half, half addrspace(1)* %a
139 %b.val = load half, half addrspace(1)* %b
138 %a.val = load volatile half, half addrspace(1)* %a
139 %b.val = load volatile half, half addrspace(1)* %b
140140 %r.val = fcmp one half %a.val, %b.val
141141 %r.val.sext = sext i1 %r.val to i32
142142 store i32 %r.val.sext, i32 addrspace(1)* %r
158158 half addrspace(1)* %a,
159159 half addrspace(1)* %b) {
160160 entry:
161 %a.val = load half, half addrspace(1)* %a
162 %b.val = load half, half addrspace(1)* %b
161 %a.val = load volatile half, half addrspace(1)* %a
162 %b.val = load volatile half, half addrspace(1)* %b
163163 %r.val = fcmp oge half %a.val, %b.val
164164 %r.val.sext = sext i1 %r.val to i32
165165 store i32 %r.val.sext, i32 addrspace(1)* %r
181181 half addrspace(1)* %a,
182182 half addrspace(1)* %b) {
183183 entry:
184 %a.val = load half, half addrspace(1)* %a
185 %b.val = load half, half addrspace(1)* %b
184 %a.val = load volatile half, half addrspace(1)* %a
185 %b.val = load volatile half, half addrspace(1)* %b
186186 %r.val = fcmp ord half %a.val, %b.val
187187 %r.val.sext = sext i1 %r.val to i32
188188 store i32 %r.val.sext, i32 addrspace(1)* %r
204204 half addrspace(1)* %a,
205205 half addrspace(1)* %b) {
206206 entry:
207 %a.val = load half, half addrspace(1)* %a
208 %b.val = load half, half addrspace(1)* %b
207 %a.val = load volatile half, half addrspace(1)* %a
208 %b.val = load volatile half, half addrspace(1)* %b
209209 %r.val = fcmp uno half %a.val, %b.val
210210 %r.val.sext = sext i1 %r.val to i32
211211 store i32 %r.val.sext, i32 addrspace(1)* %r
227227 half addrspace(1)* %a,
228228 half addrspace(1)* %b) {
229229 entry:
230 %a.val = load half, half addrspace(1)* %a
231 %b.val = load half, half addrspace(1)* %b
230 %a.val = load volatile half, half addrspace(1)* %a
231 %b.val = load volatile half, half addrspace(1)* %b
232232 %r.val = fcmp ult half %a.val, %b.val
233233 %r.val.sext = sext i1 %r.val to i32
234234 store i32 %r.val.sext, i32 addrspace(1)* %r
250250 half addrspace(1)* %a,
251251 half addrspace(1)* %b) {
252252 entry:
253 %a.val = load half, half addrspace(1)* %a
254 %b.val = load half, half addrspace(1)* %b
253 %a.val = load volatile half, half addrspace(1)* %a
254 %b.val = load volatile half, half addrspace(1)* %b
255255 %r.val = fcmp ueq half %a.val, %b.val
256256 %r.val.sext = sext i1 %r.val to i32
257257 store i32 %r.val.sext, i32 addrspace(1)* %r
273273 half addrspace(1)* %a,
274274 half addrspace(1)* %b) {
275275 entry:
276 %a.val = load half, half addrspace(1)* %a
277 %b.val = load half, half addrspace(1)* %b
276 %a.val = load volatile half, half addrspace(1)* %a
277 %b.val = load volatile half, half addrspace(1)* %b
278278 %r.val = fcmp ule half %a.val, %b.val
279279 %r.val.sext = sext i1 %r.val to i32
280280 store i32 %r.val.sext, i32 addrspace(1)* %r
296296 half addrspace(1)* %a,
297297 half addrspace(1)* %b) {
298298 entry:
299 %a.val = load half, half addrspace(1)* %a
300 %b.val = load half, half addrspace(1)* %b
299 %a.val = load volatile half, half addrspace(1)* %a
300 %b.val = load volatile half, half addrspace(1)* %b
301301 %r.val = fcmp ugt half %a.val, %b.val
302302 %r.val.sext = sext i1 %r.val to i32
303303 store i32 %r.val.sext, i32 addrspace(1)* %r
319319 half addrspace(1)* %a,
320320 half addrspace(1)* %b) {
321321 entry:
322 %a.val = load half, half addrspace(1)* %a
323 %b.val = load half, half addrspace(1)* %b
322 %a.val = load volatile half, half addrspace(1)* %a
323 %b.val = load volatile half, half addrspace(1)* %b
324324 %r.val = fcmp une half %a.val, %b.val
325325 %r.val.sext = sext i1 %r.val to i32
326326 store i32 %r.val.sext, i32 addrspace(1)* %r
342342 half addrspace(1)* %a,
343343 half addrspace(1)* %b) {
344344 entry:
345 %a.val = load half, half addrspace(1)* %a
346 %b.val = load half, half addrspace(1)* %b
345 %a.val = load volatile half, half addrspace(1)* %a
346 %b.val = load volatile half, half addrspace(1)* %b
347347 %r.val = fcmp uge half %a.val, %b.val
348348 %r.val.sext = sext i1 %r.val to i32
349349 store i32 %r.val.sext, i32 addrspace(1)* %r
2929 half addrspace(1)* %arg_mag,
3030 half addrspace(1)* %arg_sign) {
3131 entry:
32 %mag = load half, half addrspace(1)* %arg_mag
33 %sign = load half, half addrspace(1)* %arg_sign
32 %mag = load volatile half, half addrspace(1)* %arg_mag
33 %sign = load volatile half, half addrspace(1)* %arg_sign
3434 %out = call half @llvm.copysign.f16(half %mag, half %sign)
3535 store half %out, half addrspace(1)* %arg_out
3636 ret void
77
88 ; Try to identify arg based on higher address.
99 ; FUNC-LABEL: {{^}}test_copysign_f32:
10 ; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb
11 ; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc
12 ; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c
13 ; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30
14 ; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
15 ; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
10 ; SI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0xb
11 ; VI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0x2c
12
13 ; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], s[[SSIGN]]
14 ; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], s[[SMAG]]
1615 ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2
1716 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
1817 ; GCN: buffer_store_dword [[RESULT]],
55 declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
66
77 ; FUNC-LABEL: {{^}}test_copysign_f64:
8 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
9 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
10 ; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
11 ; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
8 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
9 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x1d
10 ; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c
11 ; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x74
1212 ; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
1313 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
1414 ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2
1616 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
1717 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
1818 ; GCN: s_endpgm
19 define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
19 define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, [8 x i32], double %mag, [8 x i32], double %sign) nounwind {
2020 %result = call double @llvm.copysign.f64(double %mag, double %sign)
2121 store double %result, double addrspace(1)* %out, align 8
2222 ret void
2323 }
2424
2525 ; FUNC-LABEL: {{^}}test_copysign_f64_f32:
26 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
27 ; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
26 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
27 ; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c
2828 ; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
2929 ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}}
3030 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
3232 ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]]
3333 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
3434 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
35 define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
35 define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, [8 x i32], double %mag, float %sign) nounwind {
3636 %c = fpext float %sign to double
3737 %result = call double @llvm.copysign.f64(double %mag, double %c)
3838 store double %result, double addrspace(1)* %out, align 8
6363 ; SI: v_fma_f32
6464 ; SI: v_fma_f32
6565 ; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
66 ; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}}
6766 ; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
6867 ; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
68 ; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}}
6969
7070 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}},
7171 ; EG-DAG: FMA {{\*? *}}[[RES]].X
2424 }
2525
2626 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
27 ; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
28 ; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
29
30 ; SI-SAFE-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
31 ; SI-NONAN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
32
33 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]]
34 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]]
27 ; SI-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
28
29 ; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
30
31 ; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
32 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
3533
3634 define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
3735 %cmp = fcmp ule float %a, %b
None ; XUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
0 ; XUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
33
44
55 ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
4343 ; GCN-DAG: buffer_store_dword [[MUL2]]
4444 ; GCN-DAG: buffer_store_dword [[MAD]]
4545 ; GCN: s_endpgm
46 define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
46 define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, [8 x i32], float %y) #0 {
4747 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
4848 %mul2 = fmul fast float %x, 2.0
4949 %mad = fadd fast float %mul2, %y
1616 half addrspace(1)* %a,
1717 half addrspace(1)* %b) {
1818 entry:
19 %a.val = load half, half addrspace(1)* %a
20 %b.val = load half, half addrspace(1)* %b
19 %a.val = load volatile half, half addrspace(1)* %a
20 %b.val = load volatile half, half addrspace(1)* %b
2121 %r.val = fmul half %a.val, %b.val
2222 store half %r.val, half addrspace(1)* %r
2323 ret void
3535 half addrspace(1)* %r,
3636 half addrspace(1)* %b) {
3737 entry:
38 %b.val = load half, half addrspace(1)* %b
38 %b.val = load volatile half, half addrspace(1)* %b
3939 %r.val = fmul half 3.0, %b.val
4040 store half %r.val, half addrspace(1)* %r
4141 ret void
5454 half addrspace(1)* %r,
5555 half addrspace(1)* %a) {
5656 entry:
57 %a.val = load half, half addrspace(1)* %a
57 %a.val = load volatile half, half addrspace(1)* %a
5858 %r.val = fmul half %a.val, 4.0
5959 store half %r.val, half addrspace(1)* %r
6060 ret void
6161 }
6262
6363 ; GCN-LABEL: {{^}}fmul_v2f16:
64 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
65 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
66
67 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
68 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
69 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
70 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
71 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
72 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
73 ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
74 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
64 ; SIVI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
65 ; SIVI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
66
67 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
68 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
69 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
70 ; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
71 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
72 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
73 ; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
74 ; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
7575 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
7676 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
7777 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
8181 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8282 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
8383
84 ; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
85 ; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
8486 ; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
8587
8688 ; GCN: buffer_store_dword v[[R_V2_F16]]
99101
100102 ; GCN-LABEL: {{^}}fmul_v2f16_imm_a:
101103 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
102 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
103 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
104 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
105 ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
106 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
107 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
108 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
104 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
105 ; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
106 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
107 ; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
108 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
109 ; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
110 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
109111
110112
111113 ; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
132134
133135 ; GCN-LABEL: {{^}}fmul_v2f16_imm_b:
134136 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
135 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
136 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
137 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
138 ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
139 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
140 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
141 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
137 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
138 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
139 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
140 ; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
141 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
142 ; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
143 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
142144
143145 ; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
144146 ; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
163165 }
164166
165167 ; GCN-LABEL: {{^}}fmul_v4f16:
166 ; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
167 ; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
168 ; GFX9: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
169 ; GFX9: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
168170
169171 ; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
170172 ; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
171173 ; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}}
172174
175 ; VI: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
176 ; VI: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
173177 ; VI: v_mul_f16_sdwa
174178 ; VI: v_mul_f16_e32
175179 ; VI: v_mul_f16_sdwa
108108 }
109109
110110 ; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16:
111 ; CI: s_load_dword s
112 ; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
111 ; CI: s_load_dword [[IN:s[0-9]+]]
112 ; CI: s_or_b32 [[FNEG_FABS:s[0-9]+]], [[IN]], 0x80008000
113 ; CI: s_lshr_b32
113114 ; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
114115 ; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
115116 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
5454 }
5555
5656 ; GCN-LABEL: {{^}}fneg_fabs_f64:
57 ; GCN-DAG: s_load_dwordx2
5857 ; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
59 ; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
60 ; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
58 ; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x13
59 ; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x4c
6160 ; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
6261 ; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
6362 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
64 define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
63 define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], double %in) {
6564 %fabs = call double @llvm.fabs.f64(double %in)
6665 %fsub = fsub double -0.000000e+00, %fabs
6766 store double %fsub, double addrspace(1)* %out, align 8
33
44 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
55 ; SI-NOT: and
6 ; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
6 ; SI: v_sub_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{s[0-9]+}}|
77 define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
88 %fabs = call float @llvm.fabs.f32(float %x)
99 %fsub = fsub float -0.000000e+00, %fabs
1414
1515 ; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
1616 ; SI-NOT: and
17 ; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
17 ; SI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{s[0-9]+}}|
1818 ; SI-NOT: and
1919 define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
2020 %fabs = call float @llvm.fabs.f32(float %x)
4747 }
4848
4949 ; GCN-LABEL: {{^}}fneg_fold_f64:
50 ; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
51 ; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
50 ; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
51 ; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
5252 ; GCN-NOT: xor
5353 ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
54 define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
54 define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, [8 x i32], double %in) {
5555 %fsub = fsub double -0.0, %in
5656 %fmul = fmul double %fsub, %in
5757 store double %fmul, double addrspace(1)* %out
1212 define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 {
1313 entry:
1414 ; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
15 ; CHECK: s_load_dword s2, s[0:1], 0xb
16 ; CHECK: s_load_dword s0, s[0:1], 0xc
15 ; CHECK: s_load_dwordx2 s[0:1], s[0:1], 0xb
1716 ; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1817 ; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1918 ; CHECK: s_mov_b32 s10, -1
19 ; CHECK: v_mov_b32_e32 v0, 4
2020 ; CHECK: s_waitcnt lgkmcnt(0)
21 ; CHECK: s_lshl_b32 s1, s2, 2
22 ; CHECK: v_mov_b32_e32 v0, 4
21 ; CHECK: s_lshl_b32 s0, s0, 2
22 ; CHECK: v_add_i32_e32 v1, vcc, s0, v0
23 ; CHECK: s_lshl_b32 s0, s1, 2
2324 ; CHECK: s_mov_b32 s11, 0xe8f000
24 ; CHECK: v_add_i32_e32 v1, vcc, s1, v0
2525 ; CHECK: v_mov_b32_e32 v2, 7
26 ; CHECK: s_lshl_b32 s0, s0, 2
2726 ; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen
2827 ; CHECK: v_add_i32_e32 v0, vcc, s0, v0
2928 ; CHECK: s_mov_b32 s7, 0xf000
3433 ; CHECK: s_endpgm
3534
3635 %x = alloca [100 x i32], align 4, addrspace(5)
37 %0 = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)*
38 call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0
36 %alloca.bc = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)*
37 call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0
3938 %arrayidx = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %i
4039 store i32 7, i32 addrspace(5)* %arrayidx, align 4
4140 %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %j
42 %1 = load i32, i32 addrspace(5)* %arrayidx2, align 4
43 store i32 %1, i32 addrspace(1)* %a, align 4
44 call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0
41 %ld = load i32, i32 addrspace(5)* %arrayidx2, align 4
42 store i32 %ld, i32 addrspace(1)* %a, align 4
43 call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0
4544 ret void
4645 }
4746
1616 half addrspace(1)* %a,
1717 half addrspace(1)* %b) {
1818 entry:
19 %a.val = load half, half addrspace(1)* %a
20 %b.val = load half, half addrspace(1)* %b
19 %a.val = load volatile half, half addrspace(1)* %a
20 %b.val = load volatile half, half addrspace(1)* %b
2121 %r.val = fsub half %a.val, %b.val
2222 store half %r.val, half addrspace(1)* %r
2323 ret void
3535 half addrspace(1)* %r,
3636 half addrspace(1)* %b) {
3737 entry:
38 %b.val = load half, half addrspace(1)* %b
38 %b.val = load volatile half, half addrspace(1)* %b
3939 %r.val = fsub half 1.0, %b.val
4040 store half %r.val, half addrspace(1)* %r
4141 ret void
5353 half addrspace(1)* %r,
5454 half addrspace(1)* %a) {
5555 entry:
56 %a.val = load half, half addrspace(1)* %a
56 %a.val = load volatile half, half addrspace(1)* %a
5757 %r.val = fsub half %a.val, 2.0
5858 store half %r.val, half addrspace(1)* %r
5959 ret void
6060 }
6161
6262 ; GCN-LABEL: {{^}}fsub_v2f16:
63 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
64 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
65 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
66 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
67 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
68 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
63 ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
64 ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
65
66 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
67 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
68 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
69 ; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
6970
7071 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
7172 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
72 ; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
73 ; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
73 ; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
74 ; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
7475 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
7576 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
7677 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
7778 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
7879
80 ; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
81 ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
82
7983 ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
8084 ; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
8185 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
86
87
88 ; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
89 ; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
8290
8391 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
8492
100108 ; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
101109 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
102110
103 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
104 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
105 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
106 ; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
107 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
108 ; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
109 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
111 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
112 ; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
113 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
114 ; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
115 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
116 ; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
117 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
110118 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
111119 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
112120
134142 ; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
135143 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
136144
137 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
138 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
139 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
140 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
141 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
142 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
143 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
145 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
146 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
147 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
148 ; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
149 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
150 ; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
151 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
144152 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
145153 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
146154
44 ; CHECK: s_load_dwordx4
55 ; CHECK-NOT: flat_load_dword
66
7 define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) {
7 define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) {
88 bb:
99 %tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8
1010 %tmp3 = fadd float %tmp2, 0.000000e+00
2727 ; CHECK: flat_load_dword
2828 ; CHECK-NOT: s_load_dwordx4
2929
30 define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) #0 {
30 define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) #0 {
3131 bb:
3232 %tmp = call i32 @llvm.amdgcn.workitem.id.x() #1
3333 %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp
5858 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
5959 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
6060
61 define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
61 define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) {
6262 store i32 0, i32 addrspace(1)* %out0
6363 %val = load i32, i32 addrspace(1)* %in
6464 store i32 %val, i32 addrspace(1)* %out1
7070 ; CHECK: flat_store_dword
7171 ; CHECK: flat_load_dword [[VVAL:v[0-9]+]]
7272 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
73 define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
73 define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) {
7474 store i32 0, i32 addrspace(1)* %out0
7575 %val = load i32, i32 addrspace(1)* %in
7676 store i32 %val, i32 addrspace(1)* %out1
7979
8080 ; uniform load from global array
8181 ; CHECK-LABEL: @global_array
82 ; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]]
82 ; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]]
83 ; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0
84 ; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0
8385 ; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0
8486 ; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
8587 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
8688 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
87
8889 @A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4
8990
9091 define amdgpu_kernel void @global_array(i32 addrspace(1)* nocapture %out) {
9192 entry:
92 %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
93 %1 = load i32, i32 addrspace(1)* %0, align 4
94 store i32 %1, i32 addrspace(1)* %out, align 4
93 %load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
94 %load1 = load i32, i32 addrspace(1)* %load0, align 4
95 store i32 %load1, i32 addrspace(1)* %out, align 4
9596 ret void
9697 }
9798
104105 ; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
105106 ; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]]
106107 ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
107 define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, i32 %n) {
108 define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, [8 x i32], i32 %n) {
108109 entry:
109110 %gep = getelementptr i32, i32 addrspace(1) * %out, i32 %n
110111 store i32 12, i32 addrspace(1) * %gep
111 %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
112 %1 = load i32, i32 addrspace(1)* %0, align 4
113 store i32 %1, i32 addrspace(1)* %out, align 4
112 %load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
113 %load1 = load i32, i32 addrspace(1)* %load0, align 4
114 store i32 %load1, i32 addrspace(1)* %out, align 4
114115 ret void
115116 }
116117
2121 }
2222
2323 ; GCN-LABEL: {{^}}load_v3f16_arg:
24 ; SI: s_load_dwordx2
25 ; SI: s_load_dword s
26 ; SI: s_load_dword s
27
28 ; VI: s_load_dwordx2
29 ; VI: s_load_dwordx2
30
24 ; GCN: s_load_dwordx2
25 ; GCN: s_load_dwordx2
3126 ; GCN-NOT: {buffer|flat|global}}_load_
3227
3328
4439
4540 ; FIXME: Why not one load?
4641 ; GCN-LABEL: {{^}}load_v4f16_arg:
47 ; SI-DAG: s_load_dword s[[ARG0_LO:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2
48 ; SI-DAG: s_load_dword s[[ARG0_HI:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x3
49
50 ; VI: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
51
42 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x2|0x8}}
5243 ; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], s[[ARG0_LO]]
5344 ; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], s[[ARG0_HI]]
5445 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
8576 }
8677
8778 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
88 ; SI: s_load_dwordx2 s
89 ; SI: s_load_dword s
90 ; SI: s_load_dword s
91
92 ; VI: s_load_dwordx2
93 ; VI: s_load_dwordx2
94 ; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
95
79 ; GCN: s_load_dwordx2 s
80 ; GCN: s_load_dwordx2 s
9681 ; GCN-NOT: _load
9782 ; GCN: v_cvt_f32_f16_e32
9883 ; GCN: v_cvt_f32_f16_e32
115100 }
116101
117102 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
118 ; SI: s_load_dword s
119 ; SI: s_load_dword s
120 ; SI: s_load_dword s
121 ; SI: s_load_dword s
122
123 ; VI: s_load_dwordx2 s
124 ; VI: s_load_dwordx2 s
125 ; VI: s_load_dwordx2 s
103 ; GCN: s_load_dwordx4
126104
127105 ; GCN: v_cvt_f32_f16_e32
128106 ; GCN: v_cvt_f32_f16_e32
153131 }
154132
155133 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
156 ; GCN: s_load_dword
134 ; GCN-DAG: s_load_dword s
157135 ; GCN: s_lshr_b32
158136
159137 ; GCN-DAG: v_cvt_f32_f16_e32
168146 }
169147
170148 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
171 ; SI: s_load_dword
172 ; SI: s_load_dword
173
174 ; VI: s_load_dwordx2
175 ; VI: s_load_dwordx2
176
177 ; GCN: s_lshr_b32
178
149 ; GCN: s_load_dwordx2 s
150 ; GCN: s_load_dwordx2 s
179151 ; GCN-DAG: v_cvt_f32_f16_e32
180152 ; GCN-DAG: v_cvt_f32_f16_e32
181153 ; GCN-DAG: v_cvt_f32_f16_e32
190162 }
191163
192164 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
193 ; SI: s_load_dword s
194 ; SI: s_load_dword s
195
196 ; VI: s_load_dwordx2 s
197
198 ; GCN-DAG: v_cvt_f32_f16_e32
199 ; GCN-DAG: v_cvt_f32_f16_e32
200 ; GCN-DAG: v_cvt_f32_f16_e32
201 ; GCN-DAG: v_cvt_f32_f16_e32
202 ; GCN-DAG: v_cvt_f64_f32_e32
203 ; GCN-DAG: v_cvt_f64_f32_e32
204 ; GCN-DAG: v_cvt_f64_f32_e32
205 ; GCN-DAG: v_cvt_f64_f32_e32
165 ; GCN: s_load_dwordx2 s
166 ; GCN: s_load_dwordx2 s
167
168 ; GCN: v_cvt_f32_f16_e32
169 ; GCN: v_cvt_f32_f16_e32
170 ; GCN: v_cvt_f32_f16_e32
171 ; GCN: v_cvt_f32_f16_e32
172 ; GCN: v_cvt_f64_f32_e32
173 ; GCN: v_cvt_f64_f32_e32
174 ; GCN: v_cvt_f64_f32_e32
175 ; GCN: v_cvt_f64_f32_e32
206176 ; GCN: s_endpgm
207177 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
208178 %ext = fpext <4 x half> %arg to <4 x double>
211181 }
212182
213183 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
214 ; SI: s_load_dword s
215 ; SI-NEXT: s_load_dword s
216 ; SI-NEXT: s_load_dword s
217 ; SI-NEXT: s_load_dword s
218 ; SI-NOT: _load_
219
220 ; VI: s_load_dwordx2 s
221 ; VI: s_load_dwordx2 s
184 ; GCN: s_load_dwordx2 s
185 ; GCN: s_load_dwordx4 s
222186
223187 ; GCN-DAG: v_cvt_f32_f16_e32
224188 ; GCN-DAG: v_cvt_f32_f16_e32
298262 ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
299263 ; GCN: flat_load_dword [[LOAD:v[0-9]+]],
300264
301 ; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
302 ; SI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
265 ; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
266 ; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
267
303268 ; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
304269
270 ; VI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
305271 ; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
306 ; VI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
307272
308273 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
309274 ; GCN: s_endpgm
347312 ; SI: v_cvt_f32_f16_e32
348313 ; SI: v_cvt_f32_f16_e32
349314 ; SI: v_cvt_f32_f16_e32
315 ; SI: v_cvt_f32_f16_e32
350316
351317 ; GCN: flat_store_dwordx4
352318
353 ; SI: v_cvt_f32_f16_e32
354319 ; SI: v_cvt_f32_f16_e32
355320 ; SI: v_cvt_f32_f16_e32
356321 ; SI: v_cvt_f32_f16_e32
429394 ; XVI-NOT: v_cvt_f32_f16
430395
431396 ; GCN: flat_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
432 ; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
433 ; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
434 ; SI-DAG: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
435 ; SI-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
436 ; VI-DAG: v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
437
438 ; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
439 ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
440 ; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
397 ; GCN: v_cvt_f32_f16_e32
398 ; GCN: v_cvt_f32_f16_e32
399 ; SI: v_cvt_f32_f16_e32
400 ; VI: v_cvt_f32_f16_sdwa
401 ; GCN-NOT: v_cvt_f32_f16
402
403 ; GCN: v_cvt_f64_f32_e32
404 ; GCN: v_cvt_f64_f32_e32
405 ; GCN: v_cvt_f64_f32_e32
441406 ; GCN-NOT: v_cvt_f64_f32_e32
442407
443 ; GCN-DAG: flat_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[XLO]]:[[YHI]]{{\]}}
444 ; GCN-DAG: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[Z]]
408 ; GCN-DAG: flat_store_dwordx4
409 ; GCN-DAG: flat_store_dwordx2
445410 ; GCN: s_endpgm
446411 define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
447412 %val = load <3 x half>, <3 x half> addrspace(1)* %in
77 ; CHECK: Version: [ 1, 0 ]
88 ; CHECK: Kernels:
99
10 ; CHECK: - Name: test
10 ; CHECK-LABEL: - Name: test
1111 ; CHECK: SymbolName: 'test@kd'
1212 ; CHECK: CodeProps:
1313 ; CHECK: KernargSegmentSize: 24
1515 ; CHECK: PrivateSegmentFixedSize: 0
1616 ; CHECK: KernargSegmentAlign: 8
1717 ; CHECK: WavefrontSize: 64
18 ; CHECK: NumSGPRs: 6
19 ; CHECK: NumVGPRs: 3
18 ; CHECK: NumSGPRs: 8
19 ; CHECK: NumVGPRs: 6
2020 ; CHECK: MaxFlatWorkGroupSize: 256
2121 define amdgpu_kernel void @test(
2222 half addrspace(1)* %r,
3030 ret void
3131 }
3232
33 ; CHECK: - Name: num_spilled_sgprs
33 ; CHECK-LABEL: - Name: num_spilled_sgprs
3434 ; CHECK: SymbolName: 'num_spilled_sgprs@kd'
3535 ; CHECK: CodeProps:
36 ; CHECK: NumSpilledSGPRs: 41
36 ; GFX700: NumSpilledSGPRs: 40
37 ; GFX803: NumSpilledSGPRs: 24
38 ; GFX900: NumSpilledSGPRs: 24
3739 define amdgpu_kernel void @num_spilled_sgprs(
38 i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %out2,
39 i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, i32 addrspace(1)* %out5,
40 i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, i32 addrspace(1)* %out8,
41 i32 addrspace(1)* %out9, i32 addrspace(1)* %outa, i32 addrspace(1)* %outb,
42 i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, i32 addrspace(1)* %oute,
43 i32 addrspace(1)* %outf, i32 %in0, i32 %in1, i32 %in2, i32 %in3, i32 %in4,
44 i32 %in5, i32 %in6, i32 %in7, i32 %in8, i32 %in9, i32 %ina, i32 %inb,
40 i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
41 i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],
42 i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, [8 x i32],
43 i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, [8 x i32],
44 i32 addrspace(1)* %out8, i32 addrspace(1)* %out9, [8 x i32],
45 i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, [8 x i32],
46 i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, [8 x i32],
47 i32 addrspace(1)* %oute, i32 addrspace(1)* %outf, [8 x i32],
48 i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32],
49 i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32],
50 i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32],
4551 i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 {
4652 entry:
4753 store i32 %in0, i32 addrspace(1)* %out0
6369 ret void
6470 }
6571
66 ; CHECK: - Name: num_spilled_vgprs
72 ; CHECK-LABEL: - Name: num_spilled_vgprs
6773 ; CHECK: SymbolName: 'num_spilled_vgprs@kd'
6874 ; CHECK: CodeProps:
6975 ; CHECK: NumSpilledVGPRs: 14
343343
344344
345345 ; GCN-LABEL: {{^}}add_inline_imm_0.0_f64:
346 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
347 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
346 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
347 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
348348 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
349349 ; GCN: buffer_store_dwordx2 [[REG]]
350 define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
350 define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
351351 %y = fadd double %x, 0.0
352352 store double %y, double addrspace(1)* %out
353353 ret void
354354 }
355355
356356 ; GCN-LABEL: {{^}}add_inline_imm_0.5_f64:
357 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
358 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
357 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
358 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
359359 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
360360 ; GCN: buffer_store_dwordx2 [[REG]]
361 define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
361 define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
362362 %y = fadd double %x, 0.5
363363 store double %y, double addrspace(1)* %out
364364 ret void
365365 }
366366
367367 ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
368 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
369 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
368 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
369 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
370370 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
371371 ; GCN: buffer_store_dwordx2 [[REG]]
372 define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
372 define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
373373 %y = fadd double %x, -0.5
374374 store double %y, double addrspace(1)* %out
375375 ret void
376376 }
377377
378378 ; GCN-LABEL: {{^}}add_inline_imm_1.0_f64:
379 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
380 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
379 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
380 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
381381 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
382382 ; GCN: buffer_store_dwordx2 [[REG]]
383 define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
383 define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
384384 %y = fadd double %x, 1.0
385385 store double %y, double addrspace(1)* %out
386386 ret void
387387 }
388388
389389 ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
390 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
391 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
390 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
391 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
392392 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
393393 ; GCN: buffer_store_dwordx2 [[REG]]
394 define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
394 define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
395395 %y = fadd double %x, -1.0
396396 store double %y, double addrspace(1)* %out
397397 ret void
398398 }
399399
400400 ; GCN-LABEL: {{^}}add_inline_imm_2.0_f64:
401 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
402 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
401 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
402 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
403403 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
404404 ; GCN: buffer_store_dwordx2 [[REG]]
405 define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
405 define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
406406 %y = fadd double %x, 2.0
407407 store double %y, double addrspace(1)* %out
408408 ret void
409409 }
410410
411411 ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
412 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
413 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
412 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
413 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
414414 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
415415 ; GCN: buffer_store_dwordx2 [[REG]]
416 define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
416 define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
417417 %y = fadd double %x, -2.0
418418 store double %y, double addrspace(1)* %out
419419 ret void
420420 }
421421
422422 ; GCN-LABEL: {{^}}add_inline_imm_4.0_f64:
423 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
424 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
423 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
424 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
425425 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
426426 ; GCN: buffer_store_dwordx2 [[REG]]
427 define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
427 define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
428428 %y = fadd double %x, 4.0
429429 store double %y, double addrspace(1)* %out
430430 ret void
431431 }
432432
433433 ; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
434 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
435 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
434 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
435 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
436436 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
437437 ; GCN: buffer_store_dwordx2 [[REG]]
438 define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
438 define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
439439 %y = fadd double %x, -4.0
440440 store double %y, double addrspace(1)* %out
441441 ret void
442442 }
443443
444444 ; GCN-LABEL: {{^}}add_inline_imm_inv_2pi_f64:
445 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
445 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
446446 ; SI-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
447447 ; SI-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30
448448 ; SI: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
449449
450 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
450 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
451451 ; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}}
452452 ; VI: buffer_store_dwordx2 [[REG]]
453 define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
453 define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
454454 %y = fadd double %x, 0x3fc45f306dc9c882
455455 store double %y, double addrspace(1)* %out
456456 ret void
460460 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
461461 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
462462 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
463 define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
463 define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
464464 %y = fadd double %x, 0xbfc45f306dc9c882
465465 store double %y, double addrspace(1)* %out
466466 ret void
467467 }
468468
469469 ; GCN-LABEL: {{^}}add_inline_imm_1_f64:
470 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
471 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
470 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
471 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
472472 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
473473 ; GCN: buffer_store_dwordx2 [[REG]]
474 define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
474 define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
475475 %y = fadd double %x, 0x0000000000000001
476476 store double %y, double addrspace(1)* %out
477477 ret void
478478 }
479479
480480 ; GCN-LABEL: {{^}}add_inline_imm_2_f64:
481 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
482 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
481 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
482 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
483483 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
484484 ; GCN: buffer_store_dwordx2 [[REG]]
485 define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
485 define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
486486 %y = fadd double %x, 0x0000000000000002
487487 store double %y, double addrspace(1)* %out
488488 ret void
489489 }
490490
491491 ; GCN-LABEL: {{^}}add_inline_imm_16_f64:
492 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
493 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
492 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
493 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
494494 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
495495 ; GCN: buffer_store_dwordx2 [[REG]]
496 define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
496 define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
497497 %y = fadd double %x, 0x0000000000000010
498498 store double %y, double addrspace(1)* %out
499499 ret void
503503 ; GCN: v_mov_b32_e32 v0, -1
504504 ; GCN: v_mov_b32_e32 v1, v0
505505 ; GCN: buffer_store_dwordx2 v[0:1]
506 define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
506 define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
507507 %y = fadd double %x, 0xffffffffffffffff
508508 store double %y, double addrspace(1)* %out
509509 ret void
513513 ; GCN: v_mov_b32_e32 v0, -2
514514 ; GCN: v_mov_b32_e32 v1, -1
515515 ; GCN: buffer_store_dwordx2 v[0:1]
516 define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
516 define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
517517 %y = fadd double %x, 0xfffffffffffffffe
518518 store double %y, double addrspace(1)* %out
519519 ret void
523523 ; GCN: v_mov_b32_e32 v0, -16
524524 ; GCN: v_mov_b32_e32 v1, -1
525525 ; GCN: buffer_store_dwordx2 v[0:1]
526 define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
526 define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
527527 %y = fadd double %x, 0xfffffffffffffff0
528528 store double %y, double addrspace(1)* %out
529529 ret void
530530 }
531531
532532 ; GCN-LABEL: {{^}}add_inline_imm_63_f64:
533 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
534 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
533 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
534 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
535535 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
536536 ; GCN: buffer_store_dwordx2 [[REG]]
537 define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
537 define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, [8 x i32], double %x) {
538538 %y = fadd double %x, 0x000000000000003F
539539 store double %y, double addrspace(1)* %out
540540 ret void
541541 }
542542
543543 ; GCN-LABEL: {{^}}add_inline_imm_64_f64:
544 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
545 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
544 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
545 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
546546 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
547547 ; GCN: buffer_store_dwordx2 [[REG]]
548 define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
548 define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, [8 x i32], double %x) {
549549 %y = fadd double %x, 0x0000000000000040
550550 store double %y, double addrspace(1)* %out
551551 ret void
309309 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
310310 ; GFX9: buffer_store_dword [[REG]]
311311
312 ; VI: buffer_load_dword
312 ; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
313 ; VI-DAG: buffer_load_dword
313314 ; VI-NOT: and
314 ; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
315315 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
316316 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
317317 ; VI: v_or_b32
None ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
0 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
1 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
22
33 ; FIXME: Broken on evergreen
44 ; FIXME: For some reason the 8 and 16 vectors are being stored as
7474
7575 ; GCN-LABEL: {{^}}insertelement_to_sgpr:
7676 ; GCN-NOT: v_readfirstlane
77 define amdgpu_ps <4 x float> @insertelement_to_sgpr(<4 x i32> inreg %samp) nounwind {
78 %tmp1 = insertelement <4 x i32> %samp, i32 0, i32 0
77 define <4 x float> @insertelement_to_sgpr() nounwind {
78 %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
79 %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
7980 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
8081 ret <4 x float> %tmp2
8182 }
153154 }
154155
155156 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
156 ; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
157 ; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}
157158 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
158159 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
159160 ; GCN: buffer_store_dwordx4
160 define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind {
161 define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
161162 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
162163 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
163164 ret void
200201 }
201202
202203 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
203 ; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
204 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
204 ; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
205 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
205206 ; VI-NOT: _load
206 ; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
207207 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
208 ; VI: v_lshlrev_b16_e64 [[ELT1_SHIFT:v[0-9]+]], 8, [[ELT1]]
209 ; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
210208 ; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
211
212 ; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[MASK]]
213 ; VI: v_or_b32_e32 [[BUILD_VECTOR:v[0-9]+]], [[ELT0]], [[ELT1_SHIFT]]
214
215 ; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[BUILD_VECTOR]]
216 ; VI-DAG: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
217 ; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[BUILD_VECTOR]]
209 ; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
210 ; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]]
211 ; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]]
212 ; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]]
218213 ; VI: buffer_store_short [[OR]]
219 define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
214 define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
220215 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
221216 store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
222217 ret void
226221 ; isTypeDesirableForOp in SimplifyDemandedBits
227222
228223 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
229 ; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
230 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
224 ; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
225 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
231226 ; VI-NOT: _load
232227
233 ; VI: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[LOAD]], 8
234 ; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[VEC_HI]]
235 ; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
236 ; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[VEC_HI]], [[ELT2]]
237 ; VI: s_and_b32 [[ELT2:s[0-9]+]], [[LOAD]], 0xff0000{{$}}
238
239 ; VI: s_mov_b32 [[MASK16:s[0-9]+]], 0xffff{{$}}
228 ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
240229 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
241 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], [[MASK16]], [[SCALED_IDX]]
242
243 ; VI: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
244 ; VI: v_or_b32_sdwa [[SDWA:v[0-9]+]], [[BUILD_VEC]], [[V_ELT2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
245 ; VI: s_not_b32 [[NOT_SHIFT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
246 ; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[NOT_SHIFT_MASK]], [[SDWA]]
247 ; VI: v_lshrrev_b32_e32 [[HI2:v[0-9]+]], 16, [[AND_NOT_MASK]]
248 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SCALED_IDX]], 5, [[SDWA]]
249 ; VI: buffer_store_short [[BFI]]
250 ; VI: buffer_store_byte [[HI2]]
251 define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
230 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
231 ; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
232 ; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]]
233 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
234 ; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16
235
236 ; VI-DAG: buffer_store_short [[BFI]]
237 ; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]]
238 ; VI: buffer_store_byte [[V_HI2]]
239 define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
252240 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
253241 store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
254242 ret void
255243 }
256244
257245 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
258 ; VI: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
259 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
246 ; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
247 ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
260248 ; VI-NOT: _load
261249
262 ; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 8
263 ; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
264 ; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff{{$}}
265
266
267 ; VI: s_lshr_b32 [[ELT3:s[0-9]+]], [[VEC]], 24
268 ; VI: s_lshr_b32 [[ELT2:s[0-9]+]], [[VEC]], 16
269 ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, [[ELT3]]
270 ; VI: v_or_b32_e32
271 ; VI: v_or_b32_sdwa
250 ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
272251 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
273 ; VI: v_or_b32_sdwa
274 ; VI: s_lshl_b32
275 ; VI: v_bfi_b32
276 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
252 ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
253 ; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
254 ; VI: buffer_store_dword [[BFI]]
255 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
277256 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
278257 store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
279258 ret void
280259 }
281260
282261 ; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8:
283 ; VI-NOT: {{buffer|flat|global}}
284 ; VI: s_load_dword [[IDX:s[0-9]]]
285 ; VI-NOT: {{buffer|flat|global}}
286 ; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
287 ; VI-NOT: {{buffer|flat|global}}
262 ; VI-NOT: {{buffer|flat|global}}_load
263 ; VI-DAG: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
264 ; VI-DAG: s_load_dword [[IDX:s[0-9]]], s[4:5], 0x10
265 ; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0{{$}}
266 ; VI-DAG: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
288267
289268 ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
290 ; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
291269 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
292270 ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
293271 ; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
306284
307285 ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
308286 ; GCN: s_load_dwordx2
287 ; GCN: s_load_dwordx4
309288 ; GCN: s_load_dword s
310 ; GCN: s_load_dword s
311 ; GCN: s_load_dword s
312 ; GCN: s_load_dword s
313 ; GCN: s_load_dword s
314 ; GCN-NOT: _load_
315
316289
317290 ; GCN: buffer_store_byte
318291 ; GCN: buffer_store_byte
367340
368341 ; GCN-LABEL: {{^}}dynamic_insertelement_v2f64:
369342 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
370 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
343 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}}
371344
372345 ; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
373346
389362
390363 ; GCN: buffer_store_dwordx4
391364 ; GCN: s_endpgm
392 define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
365 define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
393366 %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
394367 store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
395368 ret void
419392 ; space is also 2x what should be required.
420393
421394 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
422 ; GCN: SCRATCH_RSRC_DWORD
423395
424396 ; Stack store
425397
426 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
427 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
398 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
399 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
428400
429401 ; Write element
430 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
402 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
431403
432404 ; Stack reload
433 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
434 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
405 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
406 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
435407
436408 ; Store result
437409 ; GCN: buffer_store_dwordx4
446418 }
447419
448420 ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
449 ; GCN-DAG: SCRATCH_RSRC_DWORD
450
451 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
452 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
453 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
454 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
455
456 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
457
458 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
459 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
460 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
461 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
421 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
422 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
423 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
424 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}
425
426 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
427
428 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
429 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
430 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
431 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}
462432
463433 ; GCN: buffer_store_dwordx4
464434 ; GCN: buffer_store_dwordx4
None ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
2 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
0 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
1 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
2 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
33
44 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
5 ; GCN: s_load_dword [[VEC:s[0-9]+]]
5 ; GCN: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
66
77 ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
88 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}
1717 }
1818
1919 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reg:
20 ; GCN: s_load_dword [[ELT0:s[0-9]+]]
21 ; GCN: s_load_dword [[VEC:s[0-9]+]]
22
23 ; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
20 ; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5],
21 ; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
22
23 ; CIVI-DAG: s_and_b32 [[ELT0:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
2424 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
2525 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
2626
2727 ; GFX9-NOT: [[ELT0]]
2828 ; GFX9-NOT: [[VEC]]
29 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
30 define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
29 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[VEC]]
30 define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
3131 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
3232 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
3333 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
3535 }
3636
3737 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg:
38 ; GCN: s_load_dword [[ELT0:s[0-9]+]]
39 ; GCN: s_load_dword [[VEC:s[0-9]+]]
40
41 ; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
38 ; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5],
39 ; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
40
41 ; CI-DAG: s_and_b32 [[ELT0_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
4242 ; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
4343 ; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
44 ; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
44 ; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0_MASKED]], [[ELT1]]
4545 ; CI-DAG: ; use [[SHR]]
4646
4747
4848 ; FIXME: Should be able to void mask of upper bits
49 ; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
49 ; VI-DAG: s_and_b32 [[ELT_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
5050 ; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
51 ; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]]
52 ; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
51 ; VI-DAG: s_or_b32 [[OR:s[0-9]+]], [[ELT_MASKED]], [[VEC_HIMASK]]
52 ; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
5353
5454 ; VI-DAG: ; use [[SHR]]
5555
5656
5757 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
58 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
58 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[ELT1]]
5959 ; GFX9-DAG: ; use [[ELT1]]
60 define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
60 define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
6161 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
6262 %elt1 = extractelement <2 x i16> %vec, i32 1
6363 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
6868 }
6969
7070 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi:
71 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
72 ; GCN: s_load_dword [[VEC:s[0-9]+]]
73
71 ; GCN-DAG: s_load_dword [[ELT_ARG:s[0-9]+]], s[4:5],
72 ; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
73
74 ; CIVI: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
7475 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
75 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]]
76 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[ELT1]]
7677
7778 ; GFX9-NOT: [[ELT0]]
7879 ; GFX9-NOT: [[VEC]]
7980 ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
80 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
81 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
8182 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
8283 %elt.hi = lshr i32 %elt.arg, 16
8384 %elt = trunc i32 %elt.hi to i16
8788 }
8889
8990 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1:
90 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
91 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]],
9192 ; GCN: s_load_dword [[VEC:s[0-9]+]],
9293
9394 ; CIVI-DAG: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
109110 }
110111
111112 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1:
112 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
113 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]],
113114 ; GCN: s_load_dword [[VEC:s[0-9]+]],
114115
115116 ; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
160161 }
161162
162163 ; GCN-LABEL: {{^}}s_insertelement_v2i16_1_reg:
163 ; GCN: s_load_dword [[ELT1:s[0-9]+]]
164 ; GCN: s_load_dword [[VEC:s[0-9]+]]
165
164 ; GCN-DAG: s_load_dword [[ELT1_LOAD:s[0-9]+]], s[4:5],
165 ; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
166
167 ; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[ELT1_LOAD]], 16
166168 ; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
167169 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
168170
169171 ; GCN-NOT: shlr
170 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
171 define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
172 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1_LOAD]]
173 define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
172174 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
173175 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
174176 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
443445
444446 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
445447 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
446 ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
448 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
447449
448450 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
449451 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
450452
451 ; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
452453 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
453454 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
454455
472473
473474 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
474475 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
475 ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
476 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
476477
477478 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
478479 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
479480
480 ; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
481481 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
482482 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
483483
500500 }
501501
502502 ; GCN-LABEL: {{^}}v_insertelement_v4f16_0:
503 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
503 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5],
504504 ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
505505
506506 ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
507507 ; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]]
508508
509 ; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
509510 ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]]
510 ; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL]], [[AND]]
511 ; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL_MASKED]], [[AND]]
511512
512513 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}}
513 define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
514 define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
514515 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
515516 %tid.ext = sext i32 %tid to i64
516517 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
530531 ; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
531532 ; GFX9: v_lshl_or_b32 v[[INS_HALF:[0-9]+]], [[VAL]], 16, [[AND]]
532533
533 ; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
534 ; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
534 ; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
535 ; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
535536 ; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
536537
538 ; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
537539 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
538 ; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL]], [[AND]]
540 ; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL_HI]], [[AND]]
539541
540542 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_HALF]]:[[HI]]{{\]}}
541543 define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
552554 }
553555
554556 ; GCN-LABEL: {{^}}v_insertelement_v4f16_2:
555 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
557 ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5],
556558 ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
557559
558560 ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
559561 ; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
560562
563 ; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
561564 ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
562 ; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
565 ; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
563566
564567 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
565 define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
568 define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
566569 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
567570 %tid.ext = sext i32 %tid to i64
568571 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
582585 ; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
583586 ; GFX9: v_lshl_or_b32 v[[INS_HI:[0-9]+]], [[VAL]], 16, [[AND]]
584587
585 ; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
586 ; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
588 ; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
589 ; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
587590 ; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
588591
592 ; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
589593 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
590 ; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
594 ; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_HI]], [[AND]]
591595
592596 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
593597 define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
610614 ; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
611615 ; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
612616
617 ; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
613618 ; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
614 ; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
619 ; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
615620
616621 ; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
617622 define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
209209 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
210210 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
211211
212 ; GCN: s_load_dword s
213 ; GCN-NOT: {{buffer|flat|global}}_load_
212 ; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
213
214 ; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
215 ; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
214216 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
215217 entry:
216218 store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
225227 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
226228 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
227229
228 ; SI: s_load_dword s
229 ; SI: s_load_dword s
230 ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
230231
231232 ; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
232233 ; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
235236 store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
236237 ret void
237238 }
239
238240 ; FUNC-LABEL: {{^}}v3i32_arg:
239241 ; HSA-VI: kernarg_segment_byte_size = 32
240242 ; HSA-VI: kernarg_segment_alignment = 4
273275 ; EG: VTX_READ_8
274276 ; EG: VTX_READ_8
275277
276 ; GCN: s_load_dword s
277 ; GCN-NOT: {{buffer|flat|global}}_load_
278 ; GCN-DAG: s_load_dwordx2 s
279 ; GCN-DAG: s_load_dword s
278280 define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
279281 entry:
280282 store <4 x i8> %in, <4 x i8> addrspace(1)* %out
289291 ; EG: VTX_READ_16
290292 ; EG: VTX_READ_16
291293
292 ; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
293 ; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
294 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
294295 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
295296
296 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x2c
297 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
297 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
298 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c
299
300
301 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
302 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c
303
304 ; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
305 ; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
298306 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
299307 entry:
300308 store <4 x i16> %in, <4 x i16> addrspace(1)* %out
347355 ; EG: VTX_READ_8
348356 ; EG: VTX_READ_8
349357
350
351 ; SI: s_load_dword s
352 ; SI: s_load_dword s
358 ; SI-NOT: {{buffer|flat|global}}_load
353359 ; SI: s_load_dwordx2 s
360 ; SI-NEXT: s_load_dwordx2 s
354361 ; SI-NOT: {{buffer|flat|global}}_load
355362
356 ; VI: s_load_dword s
357 ; VI: s_load_dword s
358
359 ; VI: v_lshlrev_b16
360 ; VI: v_or_b32_e32
361 ; VI: v_or_b32_sdwa
362 ; VI: v_or_b32_sdwa
363 ; VI: v_lshlrev_b16
364 ; VI: s_lshr_b32
365 ; VI: v_or_b32_sdwa
366 ; VI: v_or_b32_sdwa
363 ; VI: s_load_dwordx2 s