llvm.org GIT mirror: llvm / commit d9b720d
Committed by Bill Wendling

Merging r181580:
------------------------------------------------------------------------
r181580 | tstellar | 2013-05-09 19:09:45 -0700 (Thu, 09 May 2013) | 10 lines

R600: Remove AMDILPeepholeOptimizer and replace optimizations with tablegen patterns

The BFE optimization was the only one we were actually using, and it was
emitting an intrinsic that we don't support.

https://bugs.freedesktop.org/show_bug.cgi?id=64201

Reviewed-by: Christian König <christian.koenig@amd.com>

NOTE: This is a candidate for the 3.3 branch.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_33@181954 91177308-0d34-0410-b5e6-96231b3b80d8

6 changed files with 38 additions and 1217 deletions.
283283 (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
284284 >;
285285
286 // Bitfield extract patterns
287
288 def legalshift32 : ImmLeaf <i32, [{return Imm >= 0 && Imm < 32;}]>;
289 def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}],
290 SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>;
291
292 class BFEPattern <Instruction BFE> : Pat <
293 (and (srl i32:$x, legalshift32:$y), bfemask:$z),
294 (BFE $x, $y, $z)
295 >;
296
286297 include "R600Instructions.td"
287298
288299 include "SIInstrInfo.td"
114114 }
115115
116116 bool AMDGPUPassConfig::addInstSelector() {
117 addPass(createAMDGPUPeepholeOpt(*TM));
118117 addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
119118
120119 const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
lib/Target/R600/AMDILPeepholeOptimizer.cpp (+0, -1215)
None //===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 /// \file
8 //==-----------------------------------------------------------------------===//
9
10 #define DEBUG_TYPE "PeepholeOpt"
11 #ifdef DEBUG
12 #define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
13 #else
14 #define DEBUGME 0
15 #endif
16
17 #include "AMDILDevices.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/ADT/StringExtras.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/ADT/Twine.h"
23 #include "llvm/IR/Constants.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
26 #include "llvm/IR/Function.h"
27 #include "llvm/IR/Instructions.h"
28 #include "llvm/IR/Module.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/MathExtras.h"
31
32 #include <sstream>
33
34 #if 0
35 STATISTIC(PointerAssignments, "Number of dynamic pointer "
36 "assigments discovered");
37 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
38 #endif
39
40 using namespace llvm;
41 // The Peephole optimization pass is used to do simple last minute optimizations
42 // that are required for correct code or to remove redundant functions
43 namespace {
44
45 class OpaqueType;
46
47 class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
48 public:
49 TargetMachine &TM;
50 static char ID;
51 AMDGPUPeepholeOpt(TargetMachine &tm);
52 ~AMDGPUPeepholeOpt();
53 const char *getPassName() const;
54 bool runOnFunction(Function &F);
55 bool doInitialization(Module &M);
56 bool doFinalization(Module &M);
57 void getAnalysisUsage(AnalysisUsage &AU) const;
58 protected:
59 private:
60 // Function to initiate all of the instruction level optimizations.
61 bool instLevelOptimizations(BasicBlock::iterator *inst);
62 // Quick check to see if we need to dump all of the pointers into the
63 // arena. If this is correct, then we set all pointers to exist in arena. This
64 // is a workaround for aliasing of pointers in a struct/union.
65 bool dumpAllIntoArena(Function &F);
66 // Because I don't want to invalidate any pointers while in the
67 // safeNestedForEachFunction. I push atomic conversions to a vector and handle
68 // it later. This function does the conversions if required.
69 void doAtomicConversionIfNeeded(Function &F);
70 // Because __amdil_is_constant cannot be properly evaluated if
71 // optimizations are disabled, the call's are placed in a vector
72 // and evaluated after the __amdil_image* functions are evaluated
73 // which should allow the __amdil_is_constant function to be
74 // evaluated correctly.
75 void doIsConstCallConversionIfNeeded();
76 bool mChanged;
77 bool mDebug;
78 bool mConvertAtomics;
79 CodeGenOpt::Level optLevel;
80 // Run a series of tests to see if we can optimize a CALL instruction.
81 bool optimizeCallInst(BasicBlock::iterator *bbb);
82 // A peephole optimization to optimize bit extract sequences.
83 bool optimizeBitExtract(Instruction *inst);
84 // A peephole optimization to optimize bit insert sequences.
85 bool optimizeBitInsert(Instruction *inst);
86 bool setupBitInsert(Instruction *base,
87 Instruction *&src,
88 Constant *&mask,
89 Constant *&shift);
90 // Expand the bit field insert instruction on versions of OpenCL that
91 // don't support it.
92 bool expandBFI(CallInst *CI);
93 // Expand the bit field mask instruction on version of OpenCL that
94 // don't support it.
95 bool expandBFM(CallInst *CI);
96 // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
97 // this case we need to expand them. These functions check for 24bit functions
98 // and then expand.
99 bool isSigned24BitOps(CallInst *CI);
100 void expandSigned24BitOps(CallInst *CI);
101 // One optimization that can occur is that if the required workgroup size is
102 // specified then the result of get_local_size is known at compile time and
103 // can be returned accordingly.
104 bool isRWGLocalOpt(CallInst *CI);
105 // On northern island cards, the division is slightly less accurate than on
106 // previous generations, so we need to utilize a more accurate division. So we
107 // can translate the accurate divide to a normal divide on all other cards.
108 bool convertAccurateDivide(CallInst *CI);
109 void expandAccurateDivide(CallInst *CI);
110 // If the alignment is set incorrectly, it can produce really inefficient
111 // code. This checks for this scenario and fixes it if possible.
112 bool correctMisalignedMemOp(Instruction *inst);
113
114 // If we are in no opt mode, then we need to make sure that
115 // local samplers are properly propagated as constant propagation
116 // doesn't occur and we need to know the value of kernel defined
117 // samplers at compile time.
118 bool propagateSamplerInst(CallInst *CI);
119
120 // Helper functions
121
122 // Group of functions that recursively calculate the size of a structure based
123 // on it's sub-types.
124 size_t getTypeSize(Type * const T, bool dereferencePtr = false);
125 size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
126 size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
127 size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
128 size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
129 size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
130 size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
131 size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
132
133 LLVMContext *mCTX;
134 Function *mF;
135 const AMDGPUSubtarget *mSTM;
136 SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
137 SmallVector<CallInst *, 16> isConstVec;
138 }; // class AMDGPUPeepholeOpt
139 char AMDGPUPeepholeOpt::ID = 0;
140
141 // A template function that has two levels of looping before calling the
142 // function with a pointer to the current iterator.
143 template <class InputIterator, class SecondIterator, class Function>
144 Function safeNestedForEach(InputIterator First, InputIterator Last,
145 SecondIterator S, Function F) {
146 for ( ; First != Last; ++First) {
147 SecondIterator sf, sl;
148 for (sf = First->begin(), sl = First->end();
149 sf != sl; ) {
150 if (!F(&sf)) {
151 ++sf;
152 }
153 }
154 }
155 return F;
156 }
157
158 } // anonymous namespace
159
160 namespace llvm {
161 FunctionPass *
162 createAMDGPUPeepholeOpt(TargetMachine &tm) {
163 return new AMDGPUPeepholeOpt(tm);
164 }
165 } // llvm namespace
166
167 AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
168 : FunctionPass(ID), TM(tm) {
169 mDebug = DEBUGME;
170 optLevel = TM.getOptLevel();
171
172 }
173
174 AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
175 }
176
177 const char *
178 AMDGPUPeepholeOpt::getPassName() const {
179 return "AMDGPU PeepHole Optimization Pass";
180 }
181
182 bool
183 containsPointerType(Type *Ty) {
184 if (!Ty) {
185 return false;
186 }
187 switch(Ty->getTypeID()) {
188 default:
189 return false;
190 case Type::StructTyID: {
191 const StructType *ST = dyn_cast<StructType>(Ty);
192 for (StructType::element_iterator stb = ST->element_begin(),
193 ste = ST->element_end(); stb != ste; ++stb) {
194 if (!containsPointerType(*stb)) {
195 continue;
196 }
197 return true;
198 }
199 break;
200 }
201 case Type::VectorTyID:
202 case Type::ArrayTyID:
203 return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
204 case Type::PointerTyID:
205 return true;
206 };
207 return false;
208 }
209
210 bool
211 AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
212 bool dumpAll = false;
213 for (Function::const_arg_iterator cab = F.arg_begin(),
214 cae = F.arg_end(); cab != cae; ++cab) {
215 const Argument *arg = cab;
216 const PointerType *PT = dyn_cast<PointerType>(arg->getType());
217 if (!PT) {
218 continue;
219 }
220 Type *DereferencedType = PT->getElementType();
221 if (!dyn_cast<StructType>(DereferencedType)
222 ) {
223 continue;
224 }
225 if (!containsPointerType(DereferencedType)) {
226 continue;
227 }
228 // FIXME: Because a pointer inside of a struct/union may be aliased to
229 // another pointer we need to take the conservative approach and place all
230 // pointers into the arena until more advanced detection is implemented.
231 dumpAll = true;
232 }
233 return dumpAll;
234 }
235 void
236 AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
237 if (isConstVec.empty()) {
238 return;
239 }
240 for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
241 CallInst *CI = isConstVec[x];
242 Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
243 Type *aType = Type::getInt32Ty(*mCTX);
244 Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
245 : ConstantInt::get(aType, 0);
246 CI->replaceAllUsesWith(Val);
247 CI->eraseFromParent();
248 }
249 isConstVec.clear();
250 }
251 void
252 AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
253 // Don't do anything if we don't have any atomic operations.
254 if (atomicFuncs.empty()) {
255 return;
256 }
257 // Change the function name for the atomic if it is required
258 uint32_t size = atomicFuncs.size();
259 for (uint32_t x = 0; x < size; ++x) {
260 atomicFuncs[x].first->setOperand(
261 atomicFuncs[x].first->getNumOperands()-1,
262 atomicFuncs[x].second);
263
264 }
265 mChanged = true;
266 if (mConvertAtomics) {
267 return;
268 }
269 }
270
271 bool
272 AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
273 mChanged = false;
274 mF = &MF;
275 mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
276 if (mDebug) {
277 MF.dump();
278 }
279 mCTX = &MF.getType()->getContext();
280 mConvertAtomics = true;
281 safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
282 std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
283 this));
284
285 doAtomicConversionIfNeeded(MF);
286 doIsConstCallConversionIfNeeded();
287
288 if (mDebug) {
289 MF.dump();
290 }
291 return mChanged;
292 }
293
294 bool
295 AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
296 Instruction *inst = (*bbb);
297 CallInst *CI = dyn_cast<CallInst>(inst);
298 if (!CI) {
299 return false;
300 }
301 if (isSigned24BitOps(CI)) {
302 expandSigned24BitOps(CI);
303 ++(*bbb);
304 CI->eraseFromParent();
305 return true;
306 }
307 if (propagateSamplerInst(CI)) {
308 return false;
309 }
310 if (expandBFI(CI) || expandBFM(CI)) {
311 ++(*bbb);
312 CI->eraseFromParent();
313 return true;
314 }
315 if (convertAccurateDivide(CI)) {
316 expandAccurateDivide(CI);
317 ++(*bbb);
318 CI->eraseFromParent();
319 return true;
320 }
321
322 StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
323 if (calleeName.startswith("__amdil_is_constant")) {
324 // If we do not have optimizations, then this
325 // cannot be properly evaluated, so we add the
326 // call instruction to a vector and process
327 // them at the end of processing after the
328 // samplers have been correctly handled.
329 if (optLevel == CodeGenOpt::None) {
330 isConstVec.push_back(CI);
331 return false;
332 } else {
333 Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
334 Type *aType = Type::getInt32Ty(*mCTX);
335 Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
336 : ConstantInt::get(aType, 0);
337 CI->replaceAllUsesWith(Val);
338 ++(*bbb);
339 CI->eraseFromParent();
340 return true;
341 }
342 }
343
344 if (calleeName.equals("__amdil_is_asic_id_i32")) {
345 ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
346 Type *aType = Type::getInt32Ty(*mCTX);
347 Value *Val = CV;
348 if (Val) {
349 Val = ConstantInt::get(aType,
350 mSTM->device()->getDeviceFlag() & CV->getZExtValue());
351 } else {
352 Val = ConstantInt::get(aType, 0);
353 }
354 CI->replaceAllUsesWith(Val);
355 ++(*bbb);
356 CI->eraseFromParent();
357 return true;
358 }
359 Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
360 if (!F) {
361 return false;
362 }
363 if (F->getName().startswith("__atom") && !CI->getNumUses()
364 && F->getName().find("_xchg") == StringRef::npos) {
365 std::string buffer(F->getName().str() + "_noret");
366 F = dyn_cast<Function>(
367 F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
368 atomicFuncs.push_back(std::make_pair(CI, F));
369 }
370
371 if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
372 && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
373 return false;
374 }
375 if (!mConvertAtomics) {
376 return false;
377 }
378 StringRef name = F->getName();
379 if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
380 mConvertAtomics = false;
381 }
382 return false;
383 }
384
385 bool
386 AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
387 Instruction *&src,
388 Constant *&mask,
389 Constant *&shift) {
390 if (!base) {
391 if (mDebug) {
392 dbgs() << "Null pointer passed into function.\n";
393 }
394 return false;
395 }
396 bool andOp = false;
397 if (base->getOpcode() == Instruction::Shl) {
398 shift = dyn_cast<Constant>(base->getOperand(1));
399 } else if (base->getOpcode() == Instruction::And) {
400 mask = dyn_cast<Constant>(base->getOperand(1));
401 andOp = true;
402 } else {
403 if (mDebug) {
404 dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
405 }
406 // If the base is neither a Shl or a And, we don't fit any of the patterns above.
407 return false;
408 }
409 src = dyn_cast<Instruction>(base->getOperand(0));
410 if (!src) {
411 if (mDebug) {
412 dbgs() << "Failed setup since the base operand is not an instruction!\n";
413 }
414 return false;
415 }
416 // If we find an 'and' operation, then we don't need to
417 // find the next operation as we already know the
418 // bits that are valid at this point.
419 if (andOp) {
420 return true;
421 }
422 if (src->getOpcode() == Instruction::Shl && !shift) {
423 shift = dyn_cast<Constant>(src->getOperand(1));
424 src = dyn_cast<Instruction>(src->getOperand(0));
425 } else if (src->getOpcode() == Instruction::And && !mask) {
426 mask = dyn_cast<Constant>(src->getOperand(1));
427 }
428 if (!mask && !shift) {
429 if (mDebug) {
430 dbgs() << "Failed setup since both mask and shift are NULL!\n";
431 }
432 // Did not find a constant mask or a shift.
433 return false;
434 }
435 return true;
436 }
437 bool
438 AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
439 if (!inst) {
440 return false;
441 }
442 if (!inst->isBinaryOp()) {
443 return false;
444 }
445 if (inst->getOpcode() != Instruction::Or) {
446 return false;
447 }
448 if (optLevel == CodeGenOpt::None) {
449 return false;
450 }
451 // We want to do an optimization on a sequence of ops that in the end equals a
452 // single ISA instruction.
453 // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
454 // Some simplified versions of this pattern are as follows:
455 // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
456 // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
457 // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
458 // (A & B) | (D << F) when (1 << F) >= B
459 // (A << C) | (D & E) when (1 << C) >= E
460 if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
461 // The HD4XXX hardware doesn't support the ubit_insert instruction.
462 return false;
463 }
464 Type *aType = inst->getType();
465 bool isVector = aType->isVectorTy();
466 int numEle = 1;
467 // This optimization only works on 32bit integers.
468 if (aType->getScalarType()
469 != Type::getInt32Ty(inst->getContext())) {
470 return false;
471 }
472 if (isVector) {
473 const VectorType *VT = dyn_cast<VectorType>(aType);
474 numEle = VT->getNumElements();
475 // We currently cannot support more than 4 elements in a intrinsic and we
476 // cannot support Vec3 types.
477 if (numEle > 4 || numEle == 3) {
478 return false;
479 }
480 }
481 // TODO: Handle vectors.
482 if (isVector) {
483 if (mDebug) {
484 dbgs() << "!!! Vectors are not supported yet!\n";
485 }
486 return false;
487 }
488 Instruction *LHSSrc = NULL, *RHSSrc = NULL;
489 Constant *LHSMask = NULL, *RHSMask = NULL;
490 Constant *LHSShift = NULL, *RHSShift = NULL;
491 Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
492 Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
493 if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
494 if (mDebug) {
495 dbgs() << "Found an OR Operation that failed setup!\n";
496 inst->dump();
497 if (LHS) { LHS->dump(); }
498 if (LHSSrc) { LHSSrc->dump(); }
499 if (LHSMask) { LHSMask->dump(); }
500 if (LHSShift) { LHSShift->dump(); }
501 }
502 // There was an issue with the setup for BitInsert.
503 return false;
504 }
505 if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
506 if (mDebug) {
507 dbgs() << "Found an OR Operation that failed setup!\n";
508 inst->dump();
509 if (RHS) { RHS->dump(); }
510 if (RHSSrc) { RHSSrc->dump(); }
511 if (RHSMask) { RHSMask->dump(); }
512 if (RHSShift) { RHSShift->dump(); }
513 }
514 // There was an issue with the setup for BitInsert.
515 return false;
516 }
517 if (mDebug) {
518 dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
519 dbgs() << "Op: "; inst->dump();
520 dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
521 dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
522 dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
523 dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
524 dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
525 dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
526 dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
527 dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
528 }
529 Constant *offset = NULL;
530 Constant *width = NULL;
531 uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
532 uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
533 uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
534 uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
535 lhsMaskVal = (LHSMask
536 ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
537 rhsMaskVal = (RHSMask
538 ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
539 lhsShiftVal = (LHSShift
540 ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
541 rhsShiftVal = (RHSShift
542 ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
543 lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
544 rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
545 lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
546 rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
547 // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
548 if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
549 return false;
550 }
551 if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
552 offset = ConstantInt::get(aType, lhsMaskOffset, false);
553 width = ConstantInt::get(aType, lhsMaskWidth, false);
554 RHSSrc = RHS;
555 if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
556 return false;
557 }
558 if (!LHSShift) {
559 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
560 "MaskShr", LHS);
561 } else if (lhsShiftVal != lhsMaskOffset) {
562 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
563 "MaskShr", LHS);
564 }
565 if (mDebug) {
566 dbgs() << "Optimizing LHS!\n";
567 }
568 } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
569 offset = ConstantInt::get(aType, rhsMaskOffset, false);
570 width = ConstantInt::get(aType, rhsMaskWidth, false);
571 LHSSrc = RHSSrc;
572 RHSSrc = LHS;
573 if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
574 return false;
575 }
576 if (!RHSShift) {
577 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
578 "MaskShr", RHS);
579 } else if (rhsShiftVal != rhsMaskOffset) {
580 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
581 "MaskShr", RHS);
582 }
583 if (mDebug) {
584 dbgs() << "Optimizing RHS!\n";
585 }
586 } else {
587 if (mDebug) {
588 dbgs() << "Failed constraint 3!\n";
589 }
590 return false;
591 }
592 if (mDebug) {
593 dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
594 dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
595 dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
596 dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
597 }
598 if (!offset || !width) {
599 if (mDebug) {
600 dbgs() << "Either width or offset are NULL, failed detection!\n";
601 }
602 return false;
603 }
604 // Lets create the function signature.
605 std::vector<Type *> callTypes;
606 callTypes.push_back(aType);
607 callTypes.push_back(aType);
608 callTypes.push_back(aType);
609 callTypes.push_back(aType);
610 FunctionType *funcType = FunctionType::get(aType, callTypes, false);
611 std::string name = "__amdil_ubit_insert";
612 if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
613 Function *Func =
614 dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
615 getOrInsertFunction(StringRef(name), funcType));
616 Value *Operands[4] = {
617 width,
618 offset,
619 LHSSrc,
620 RHSSrc
621 };
622 CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
623 if (mDebug) {
624 dbgs() << "Old Inst: ";
625 inst->dump();
626 dbgs() << "New Inst: ";
627 CI->dump();
628 dbgs() << "\n\n";
629 }
630 CI->insertBefore(inst);
631 inst->replaceAllUsesWith(CI);
632 return true;
633 }
634
635 bool
636 AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
637 if (!inst) {
638 return false;
639 }
640 if (!inst->isBinaryOp()) {
641 return false;
642 }
643 if (inst->getOpcode() != Instruction::And) {
644 return false;
645 }
646 if (optLevel == CodeGenOpt::None) {
647 return false;
648 }
649 // We want to do some simple optimizations on Shift right/And patterns. The
650 // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
651 // value smaller than 32 and C is a mask. If C is a constant value, then the
652 // following transformation can occur. For signed integers, it turns into the
653 // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
654 // integers, it turns into the function call dst =
655 // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
656 // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
657 // Evergreen hardware.
658 if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
659 // This does not work on HD4XXX hardware.
660 return false;
661 }
662 Type *aType = inst->getType();
663 bool isVector = aType->isVectorTy();
664
665 // XXX Support vector types
666 if (isVector) {
667 return false;
668 }
669 int numEle = 1;
670 // This only works on 32bit integers
671 if (aType->getScalarType()
672 != Type::getInt32Ty(inst->getContext())) {
673 return false;
674 }
675 if (isVector) {
676 const VectorType *VT = dyn_cast<VectorType>(aType);
677 numEle = VT->getNumElements();
678 // We currently cannot support more than 4 elements in a intrinsic and we
679 // cannot support Vec3 types.
680 if (numEle > 4 || numEle == 3) {
681 return false;
682 }
683 }
684 BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
685 // If the first operand is not a shift instruction, then we can return as it
686 // doesn't match this pattern.
687 if (!ShiftInst || !ShiftInst->isShift()) {
688 return false;
689 }
690 // If we are a shift left, then we need don't match this pattern.
691 if (ShiftInst->getOpcode() == Instruction::Shl) {
692 return false;
693 }
694 bool isSigned = ShiftInst->isArithmeticShift();
695 Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
696 Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
697 // Lets make sure that the shift value and the and mask are constant integers.
698 if (!AndMask || !ShrVal) {
699 return false;
700 }
701 Constant *newMaskConst;
702 Constant *shiftValConst;
703 if (isVector) {
704 // Handle the vector case
705 std::vector<Constant *> maskVals;
706 std::vector<Constant *> shiftVals;
707 ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
708 ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
709 Type *scalarType = AndMaskVec->getType()->getScalarType();
710 assert(AndMaskVec->getNumOperands() ==
711 ShrValVec->getNumOperands() && "cannot have a "
712 "combination where the number of elements to a "
713 "shift and an and are different!");
714 for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
715 ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
716 ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
717 if (!AndCI || !ShiftIC) {
718 return false;
719 }
720 uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
721 if (!isMask_32(maskVal)) {
722 return false;
723 }
724 maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
725 uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
726 // If the mask or shiftval is greater than the bitcount, then break out.
727 if (maskVal >= 32 || shiftVal >= 32) {
728 return false;
729 }
730 // If the mask val is greater than the the number of original bits left
731 // then this optimization is invalid.
732 if (maskVal > (32 - shiftVal)) {
733 return false;
734 }
735 maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
736 shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
737 }
738 newMaskConst = ConstantVector::get(maskVals);
739 shiftValConst = ConstantVector::get(shiftVals);
740 } else {
741 // Handle the scalar case
742 uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
743 // This must be a mask value where all lower bits are set to 1 and then any
744 // bit higher is set to 0.
745 if (!isMask_32(maskVal)) {
746 return false;
747 }
748 maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
749 // Count the number of bits set in the mask, this is the width of the
750 // resulting bit set that is extracted from the source value.
751 uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
752 // If the mask or shift val is greater than the bitcount, then break out.
753 if (maskVal >= 32 || shiftVal >= 32) {
754 return false;
755 }
756 // If the mask val is greater than the the number of original bits left then
757 // this optimization is invalid.
758 if (maskVal > (32 - shiftVal)) {
759 return false;
760 }
761 newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
762 shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
763 }
764 // Lets create the function signature.
765 std::vector<Type *> callTypes;
766 callTypes.push_back(aType);
767 callTypes.push_back(aType);
768 callTypes.push_back(aType);
769 FunctionType *funcType = FunctionType::get(aType, callTypes, false);
770 std::string name = "llvm.AMDGPU.bit.extract.u32";
771 if (isVector) {
772 name += ".v" + itostr(numEle) + "i32";
773 } else {
774 name += ".";
775 }
776 // Lets create the function.
777 Function *Func =
778 dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
779 getOrInsertFunction(StringRef(name), funcType));
780 Value *Operands[3] = {
781 ShiftInst->getOperand(0),
782 shiftValConst,
783 newMaskConst
784 };
785 // Lets create the Call with the operands
786 CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
787 CI->setDoesNotAccessMemory();
788 CI->insertBefore(inst);
789 inst->replaceAllUsesWith(CI);
790 return true;
791 }
792
793 bool
794 AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
795 if (!CI) {
796 return false;
797 }
798 Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
799 if (!LHS->getName().startswith("__amdil_bfi")) {
800 return false;
801 }
802 Type* type = CI->getOperand(0)->getType();
803 Constant *negOneConst = NULL;
804 if (type->isVectorTy()) {
805 std::vector<Constant *> negOneVals;
806 negOneConst = ConstantInt::get(CI->getContext(),
807 APInt(32, StringRef("-1"), 10));
808 for (size_t x = 0,
809 y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
810 negOneVals.push_back(negOneConst);
811 }
812 negOneConst = ConstantVector::get(negOneVals);
813 } else {
814 negOneConst = ConstantInt::get(CI->getContext(),
815 APInt(32, StringRef("-1"), 10));
816 }
817 // __amdil_bfi => (A & B) | (~A & C)
818 BinaryOperator *lhs =
819 BinaryOperator::Create(Instruction::And, CI->getOperand(0),
820 CI->getOperand(1), "bfi_and", CI);
821 BinaryOperator *rhs =
822 BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
823 "bfi_not", CI);
824 rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
825 "bfi_and", CI);
826 lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
827 CI->replaceAllUsesWith(lhs);
828 return true;
829 }
830
831 bool
832 AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
833 if (!CI) {
834 return false;
835 }
836 Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
837 if (!LHS->getName().startswith("__amdil_bfm")) {
838 return false;
839 }
840 // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
841 Constant *newMaskConst = NULL;
842 Constant *newShiftConst = NULL;
843 Type* type = CI->getOperand(0)->getType();
844 if (type->isVectorTy()) {
845 std::vector<Constant *> newMaskVals, newShiftVals;
846 newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
847 newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
848 for (size_t x = 0,
849 y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
850 newMaskVals.push_back(newMaskConst);
851 newShiftVals.push_back(newShiftConst);
852 }
853 newMaskConst = ConstantVector::get(newMaskVals);
854 newShiftConst = ConstantVector::get(newShiftVals);
855 } else {
856 newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
857 newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
858 }
859 BinaryOperator *lhs =
860 BinaryOperator::Create(Instruction::And, CI->getOperand(0),
861 newMaskConst, "bfm_mask", CI);
862 lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
863 lhs, "bfm_shl", CI);
864 lhs = BinaryOperator::Create(Instruction::Sub, lhs,
865 newShiftConst, "bfm_sub", CI);
866 BinaryOperator *rhs =
867 BinaryOperator::Create(Instruction::And, CI->getOperand(1),
868 newMaskConst, "bfm_mask", CI);
869 lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
870 CI->replaceAllUsesWith(lhs);
871 return true;
872 }
873
874 bool
875 AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
876 Instruction *inst = (*bbb);
877 if (optimizeCallInst(bbb)) {
878 return true;
879 }
880 if (optimizeBitExtract(inst)) {
881 return false;
882 }
883 if (optimizeBitInsert(inst)) {
884 return false;
885 }
886 if (correctMisalignedMemOp(inst)) {
887 return false;
888 }
889 return false;
890 }
891 bool
892 AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
893 LoadInst *linst = dyn_cast<LoadInst>(inst);
894 StoreInst *sinst = dyn_cast<StoreInst>(inst);
895 unsigned alignment;
896 Type* Ty = inst->getType();
897 if (linst) {
898 alignment = linst->getAlignment();
899 Ty = inst->getType();
900 } else if (sinst) {
901 alignment = sinst->getAlignment();
902 Ty = sinst->getValueOperand()->getType();
903 } else {
904 return false;
905 }
906 unsigned size = getTypeSize(Ty);
907 if (size == alignment || size < alignment) {
908 return false;
909 }
910 if (!Ty->isStructTy()) {
911 return false;
912 }
913 if (alignment < 4) {
914 if (linst) {
915 linst->setAlignment(0);
916 return true;
917 } else if (sinst) {
918 sinst->setAlignment(0);
919 return true;
920 }
921 }
922 return false;
923 }
924 bool
925 AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
926 if (!CI) {
927 return false;
928 }
929 Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
930 std::string namePrefix = LHS->getName().substr(0, 14);
931 if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
932 && namePrefix != "__amdil__imul24_high") {
933 return false;
934 }
935 if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
936 return false;
937 }
938 return true;
939 }
940
941 void
942 AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
943 assert(isSigned24BitOps(CI) && "Must be a "
944 "signed 24 bit operation to call this function!");
945 Value *LHS = CI->getOperand(CI->getNumOperands()-1);
946 // On 7XX and 8XX we do not have signed 24bit, so we need to
947 // expand it to the following:
948 // imul24 turns into 32bit imul
949 // imad24 turns into 32bit imad
950 // imul24_high turns into 32bit imulhigh
951 if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
952 Type *aType = CI->getOperand(0)->getType();
953 bool isVector = aType->isVectorTy();
954 int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
955 std::vector<Type *> callTypes;
956 callTypes.push_back(CI->getOperand(0)->getType());
957 callTypes.push_back(CI->getOperand(1)->getType());
958 callTypes.push_back(CI->getOperand(2)->getType());
959 FunctionType *funcType =
960 FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
961 std::string name = "__amdil_imad";
962 if (isVector) {
963 name += "_v" + itostr(numEle) + "i32";
964 } else {
965 name += "_i32";
966 }
967 Function *Func = dyn_cast<Function>(
968 CI->getParent()->getParent()->getParent()->
969 getOrInsertFunction(StringRef(name), funcType));
970 Value *Operands[3] = {
971 CI->getOperand(0),
972 CI->getOperand(1),
973 CI->getOperand(2)
974 };
975 CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
976 nCI->insertBefore(CI);
977 CI->replaceAllUsesWith(nCI);
978 } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
979 BinaryOperator *mulOp =
980 BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
981 CI->getOperand(1), "imul24", CI);
982 CI->replaceAllUsesWith(mulOp);
983 } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
984 Type *aType = CI->getOperand(0)->getType();
985
986 bool isVector = aType->isVectorTy();
987 int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
988 std::vector<Type *> callTypes;
989 callTypes.push_back(CI->getOperand(0)->getType());
990 callTypes.push_back(CI->getOperand(1)->getType());
991 FunctionType *funcType =
992 FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
993 std::string name = "__amdil_imul_high";
994 if (isVector) {
995 name += "_v" + itostr(numEle) + "i32";
996 } else {
997 name += "_i32";
998 }
999 Function *Func = dyn_cast<Function>(
1000 CI->getParent()->getParent()->getParent()->
1001 getOrInsertFunction(StringRef(name), funcType));
1002 Value *Operands[2] = {
1003 CI->getOperand(0),
1004 CI->getOperand(1)
1005 };
1006 CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
1007 nCI->insertBefore(CI);
1008 CI->replaceAllUsesWith(nCI);
1009 }
1010 }
1011
1012 bool
1013 AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
1014 return (CI != NULL
1015 && CI->getOperand(CI->getNumOperands() - 1)->getName()
1016 == "__amdil_get_local_size_int");
1017 }
1018
1019 bool
1020 AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
1021 if (!CI) {
1022 return false;
1023 }
1024 if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
1025 && (mSTM->getDeviceName() == "cayman")) {
1026 return false;
1027 }
1028 return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
1029 == "__amdil_improved_div";
1030 }
1031
1032 void
1033 AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
1034 assert(convertAccurateDivide(CI)
1035 && "expanding accurate divide can only happen if it is expandable!");
1036 BinaryOperator *divOp =
1037 BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
1038 CI->getOperand(1), "fdiv32", CI);
1039 CI->replaceAllUsesWith(divOp);
1040 }
1041
1042 bool
1043 AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
1044 if (optLevel != CodeGenOpt::None) {
1045 return false;
1046 }
1047
1048 if (!CI) {
1049 return false;
1050 }
1051
1052 unsigned funcNameIdx = 0;
1053 funcNameIdx = CI->getNumOperands() - 1;
1054 StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
1055 if (calleeName != "__amdil_image2d_read_norm"
1056 && calleeName != "__amdil_image2d_read_unnorm"
1057 && calleeName != "__amdil_image3d_read_norm"
1058 && calleeName != "__amdil_image3d_read_unnorm") {
1059 return false;
1060 }
1061
1062 unsigned samplerIdx = 2;
1063 samplerIdx = 1;
1064 Value *sampler = CI->getOperand(samplerIdx);
1065 LoadInst *lInst = dyn_cast<LoadInst>(sampler);
1066 if (!lInst) {
1067 return false;
1068 }
1069
1070 if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1071 return false;
1072 }
1073
1074 GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
1075 // If we are loading from what is not a global value, then we
1076 // fail and return.
1077 if (!gv) {
1078 return false;
1079 }
1080
1081 // If we don't have an initializer or we have an initializer and
1082 // the initializer is not a 32bit integer, we fail.
1083 if (!gv->hasInitializer()
1084 || !gv->getInitializer()->getType()->isIntegerTy(32)) {
1085 return false;
1086 }
1087
1088 // Now that we have the global variable initializer, lets replace
1089 // all uses of the load instruction with the samplerVal and
1090 // reparse the __amdil_is_constant() function.
1091 Constant *samplerVal = gv->getInitializer();
1092 lInst->replaceAllUsesWith(samplerVal);
1093 return true;
1094 }
1095
1096 bool
1097 AMDGPUPeepholeOpt::doInitialization(Module &M) {
1098 return false;
1099 }
1100
1101 bool
1102 AMDGPUPeepholeOpt::doFinalization(Module &M) {
1103 return false;
1104 }
1105
1106 void
1107 AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
1108 AU.addRequired<MachineFunctionAnalysis>();
1109 FunctionPass::getAnalysisUsage(AU);
1110 AU.setPreservesAll();
1111 }
1112
1113 size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
1114 size_t size = 0;
1115 if (!T) {
1116 return size;
1117 }
1118 switch (T->getTypeID()) {
1119 case Type::X86_FP80TyID:
1120 case Type::FP128TyID:
1121 case Type::PPC_FP128TyID:
1122 case Type::LabelTyID:
1123 assert(0 && "These types are not supported by this backend");
1124 default:
1125 case Type::FloatTyID:
1126 case Type::DoubleTyID:
1127 size = T->getPrimitiveSizeInBits() >> 3;
1128 break;
1129 case Type::PointerTyID:
1130 size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
1131 break;
1132 case Type::IntegerTyID:
1133 size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
1134 break;
1135 case Type::StructTyID:
1136 size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
1137 break;
1138 case Type::ArrayTyID:
1139 size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
1140 break;
1141 case Type::FunctionTyID:
1142 size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
1143 break;
1144 case Type::VectorTyID:
1145 size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
1146 break;
1147 };
1148 return size;
1149 }
1150
1151 size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
1152 bool dereferencePtr) {
1153 size_t size = 0;
1154 if (!ST) {
1155 return size;
1156 }
1157 Type *curType;
1158 StructType::element_iterator eib;
1159 StructType::element_iterator eie;
1160 for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
1161 curType = *eib;
1162 size += getTypeSize(curType, dereferencePtr);
1163 }
1164 return size;
1165 }
1166
1167 size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
1168 bool dereferencePtr) {
1169 return IT ? (IT->getBitWidth() >> 3) : 0;
1170 }
1171
1172 size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
1173 bool dereferencePtr) {
1174 assert(0 && "Should not be able to calculate the size of an function type");
1175 return 0;
1176 }
1177
1178 size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
1179 bool dereferencePtr) {
1180 return (size_t)(AT ? (getTypeSize(AT->getElementType(),
1181 dereferencePtr) * AT->getNumElements())
1182 : 0);
1183 }
1184
1185 size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
1186 bool dereferencePtr) {
1187 return VT ? (VT->getBitWidth() >> 3) : 0;
1188 }
1189
1190 size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
1191 bool dereferencePtr) {
1192 if (!PT) {
1193 return 0;
1194 }
1195 Type *CT = PT->getElementType();
1196 if (CT->getTypeID() == Type::StructTyID &&
1197 PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1198 return getTypeSize(dyn_cast<StructType>(CT));
1199 } else if (dereferencePtr) {
1200 size_t size = 0;
1201 for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
1202 size += getTypeSize(PT->getContainedType(x), dereferencePtr);
1203 }
1204 return size;
1205 } else {
1206 return 4;
1207 }
1208 }
1209
1210 size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
1211 bool dereferencePtr) {
1212 //assert(0 && "Should not be able to calculate the size of an opaque type");
1213 return 4;
1214 }
2020 AMDILISelDAGToDAG.cpp
2121 AMDILISelLowering.cpp
2222 AMDILNIDevice.cpp
23 AMDILPeepholeOptimizer.cpp
2423 AMDILSIDevice.cpp
2524 AMDGPUAsmPrinter.cpp
2625 AMDGPUFrameLowering.cpp
16141614 i32:$src2))],
16151615 VecALU
16161616 >;
1617 def : BFEPattern <BFE_UINT_eg>;
16171618
16181619 def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
16191620 defm : BFIPatterns <BFI_INT_eg>;
0 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
1
2 ; CHECK: @bfe_def
3 ; CHECK: BFE_UINT
4 define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
5 entry:
6 %0 = lshr i32 %x, 5
7 %1 = and i32 %0, 15 ; 0xf
8 store i32 %1, i32 addrspace(1)* %out
9 ret void
10 }
11
12 ; This program could be implemented using a BFE_UINT instruction, however
13 ; since the lshr constant + number of bits in the mask is >= 32, it can also be
14 ; implemented with an LSHR instruction, which is better, because LSHR has fewer
15 ; operands and requires fewer constants.
16
17 ; CHECK: @bfe_shift
18 ; CHECK-NOT: BFE_UINT
19 define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
20 entry:
21 %0 = lshr i32 %x, 16
22 %1 = and i32 %0, 65535 ; 0xffff
23 store i32 %1, i32 addrspace(1)* %out
24 ret void
25 }
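
Editor's note: the comment before @bfe_shift observes that once the shift amount plus the mask width reaches 32, the `and` can no longer clear any bits, so a plain LSHR is preferable to BFE_UINT. The small C++ check below is a hypothetical illustration of that arithmetic fact and is not part of the commit.

#include <cassert>
#include <cstdint>

int main() {
  // For the @bfe_shift test: shift = 16 and the mask 0xFFFF covers all 16 bits
  // that remain after the shift, so (x >> 16) & 0xFFFF is identical to x >> 16
  // for every 32-bit x -- a single LSHR suffices, no BFE needed.
  for (uint64_t x = 0; x <= 0xFFFFFFFFull; x += 0x01010101ull) {
    uint32_t v = static_cast<uint32_t>(x);
    assert(((v >> 16) & 0xFFFFu) == (v >> 16));
  }
  return 0;
}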