llvm.org GIT mirror: llvm commit 81c9a29

AMDGPU: Make better use of op_sel with high components

Handle more general swizzles.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303296 91177308-0d34-0410-b5e6-96231b3b80d8
Matt Arsenault

3 changed files with 486 additions and 10 deletions.
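Background for the change: VOP3P instructions operate on 32-bit registers that hold two 16-bit elements. Per source operand, the op_sel bit selects which half feeds the low result lane and the op_sel_hi bit which half feeds the high lane, while neg_lo/neg_hi flip the sign bit of the value routed to each lane. A minimal standalone model of that routing, not part of the commit and with illustrative names:

#include <cstdint>
#include <cstdio>

// Model of how one VOP3P source operand is read. op_sel picks the half
// feeding the LOW result lane, op_sel_hi the half feeding the HIGH lane;
// neg_lo/neg_hi flip the sign bit (0x8000) of the selected half.
static uint16_t readHalf(uint32_t src, bool takeHi, bool neg) {
  uint16_t v = takeHi ? uint16_t(src >> 16) : uint16_t(src & 0xffff);
  return neg ? uint16_t(v ^ 0x8000) : v;
}

static uint32_t readOperand(uint32_t src, bool opSel, bool opSelHi,
                            bool negLo, bool negHi) {
  uint16_t lo = readHalf(src, opSel, negLo);   // low result lane
  uint16_t hi = readHalf(src, opSelHi, negHi); // high result lane
  return uint32_t(lo) | (uint32_t(hi) << 16);
}

int main() {
  uint32_t v = 0xBBBBAAAAu; // high half 0xBBBB, low half 0xAAAA
  // Swap halves and negate the low lane: op_sel=1, op_sel_hi=0, neg_lo=1.
  printf("%08x\n", readOperand(v, true, false, true, false)); // aaaa3bbb
}

The first hunk below, in lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp, teaches AMDGPUDAGToDAGISel::SelectVOP3PMods to recognize when a BUILD_VECTOR lane is really one half of an existing 32-bit register and to encode the swizzle in these modifier bits instead of emitting shift/or packing code.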
   return true;
 }
 
+static SDValue stripBitcast(SDValue Val) {
+  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16-bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+  In = stripBitcast(In);
+  if (In.getOpcode() != ISD::TRUNCATE)
+    return false;
+
+  SDValue Srl = In.getOperand(0);
+  if (Srl.getOpcode() == ISD::SRL) {
+    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+      if (ShiftAmt->getZExtValue() == 16) {
+        Out = stripBitcast(Srl.getOperand(0));
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+// Look through operations that obscure just looking at the low 16-bits of the
+// same register.
+static SDValue stripExtractLoElt(SDValue In) {
+  if (In.getOpcode() == ISD::TRUNCATE) {
+    SDValue Src = In.getOperand(0);
+    if (Src.getValueType().getSizeInBits() == 32)
+      return stripBitcast(Src);
+  }
+
+  return In;
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                                          SDValue &SrcMods) const {
   unsigned Mods = 0;
   Src = In;
 
-  // FIXME: Look for on separate components
   if (Src.getOpcode() == ISD::FNEG) {
     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
     Src = Src.getOperand(0);
   }
 
   if (Src.getOpcode() == ISD::BUILD_VECTOR) {
     unsigned VecMods = Mods;
 
-    SDValue Lo = Src.getOperand(0);
-    SDValue Hi = Src.getOperand(1);
+    SDValue Lo = stripBitcast(Src.getOperand(0));
+    SDValue Hi = stripBitcast(Src.getOperand(1));
 
     if (Lo.getOpcode() == ISD::FNEG) {
-      Lo = Lo.getOperand(0);
+      Lo = stripBitcast(Lo.getOperand(0));
       Mods ^= SISrcMods::NEG;
     }
 
     if (Hi.getOpcode() == ISD::FNEG) {
-      Hi = Hi.getOperand(0);
+      Hi = stripBitcast(Hi.getOperand(0));
       Mods ^= SISrcMods::NEG_HI;
     }
 
+    if (isExtractHiElt(Lo, Lo))
+      Mods |= SISrcMods::OP_SEL_0;
+
+    if (isExtractHiElt(Hi, Hi))
+      Mods |= SISrcMods::OP_SEL_1;
+
+    Lo = stripExtractLoElt(Lo);
+    Hi = stripExtractLoElt(Hi);
+
     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
       // Really a scalar input. Just select from the low half of the register to
       // avoid packing.
 
       Src = Lo;
       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
       return true;
     }
 
     Mods = VecMods;
   }
 
   // Packed instructions do not have abs modifiers.
-
-  // FIXME: Handle abs/neg of individual components.
-  // FIXME: Handle swizzling with op_sel
   Mods |= SISrcMods::OP_SEL_1;
 
   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
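Taken together: isExtractHiElt recognizes trunc(srl(x, 16)), the high half of a dword; stripExtractLoElt looks through a plain truncate to the low half; and SelectVOP3PMods records which half each BUILD_VECTOR lane came from, then compares the underlying 32-bit sources. If both lanes trace back to the same register, the whole swizzle collapses into modifier bits. A compact sketch of that encoding step, using illustrative constant values (the real SISrcMods encodings live in SIDefines.h):

#include <cstdint>

// Illustrative values only; see SIDefines.h for the real encodings.
enum SrcMods : unsigned { NEG = 1, NEG_HI = 2, OP_SEL_0 = 4, OP_SEL_1 = 8 };

// Scalar equivalents of the DAG patterns the helpers above match.
uint16_t hiHalf(uint32_t x) { return uint16_t(x >> 16); } // isExtractHiElt
uint16_t loHalf(uint32_t x) { return uint16_t(x); }       // stripExtractLoElt

// Once both lanes are traced to halves of the same source, two bits per
// operand describe the swizzle: OP_SEL_0 if the low result lane reads the
// high half, OP_SEL_1 if the high result lane does.
unsigned encodeHalfSelects(bool loLaneReadsHiHalf, bool hiLaneReadsHiHalf) {
  unsigned Mods = 0;
  if (loLaneReadsHiHalf)
    Mods |= OP_SEL_0;
  if (hiLaneReadsHiHalf)
    Mods |= OP_SEL_1;
  return Mods;
}

The next hunk, in lib/Target/AMDGPU/SIISelLowering.cpp, makes sure these patterns survive until instruction selection.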
   SDValue Vec = Op.getOperand(0);
   SDValue Idx = Op.getOperand(1);
 
+  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+
+  // Make sure we do any optimizations that will make it easier to fold
+  // source modifiers before obscuring it with bit operations.
+
+  // XXX - Why doesn't this get called when vector_shuffle is expanded?
+  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
+    return Combined;
+
   if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
     SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
 
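Running performExtractVectorEltCombine eagerly while lowering EXTRACT_VECTOR_ELT matters because the generic expansion hides the element access behind bit operations: element i of a <2 x i16> becomes a bitcast to i32, a shift by 16*i, and a truncate. That expansion is exactly what produces the trunc(srl(x, 16)) shape isExtractHiElt matches later, so any simplification has to happen first, while the extract is still visible. A one-line model of the expansion, assuming 16-bit elements:

#include <cstdint>

// How extractelement on a packed <2 x i16> is expanded when no combine
// fires: bitcast the vector to i32, shift the wanted element down,
// truncate. Element 1 yields the trunc(srl(x, 16)) pattern matched above.
uint16_t expandExtractElt(uint32_t vecAsI32, unsigned idx) {
  return uint16_t(vecAsI32 >> (16 * idx));
}

The rest of the diff is the accompanying test file (test/CodeGen/AMDGPU/packed-op-sel.ll), with one updated case and a battery of new swizzle cases.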
 ; GCN-NOT: shl
 ; GCN-NOT: or
 
-; GCN: v_xor_b32_e32 [[NEG_SCALAR0:v[0-9]+]], 0x8000, [[SCALAR0]]
-; GCN-NEXT: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[NEG_SCALAR0]] op_sel_hi:[1,0]{{$}}
+; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
 define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
 bb:
   %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
@@ ... @@
   ret void
 }
 
+; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %vec2.elt1 = extractelement <2 x half> %vec2, i32 1
+  %neg.vec2.elt1 = fsub half -0.0, %vec2.elt1
+
+  %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_vector_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
+define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1
+
+  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4
+
+  %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+  %result = add <2 x i16> %vec0, %vec1.elt1.broadcast
+
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1
+  %neg.neg.vec2.elt1 = fsub half -0.0, %neg.vec2.elt1
+  %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_swap_vector:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+
+  %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+; GCN-NOT: xor
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bitcast_fneg_f32:
+; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
+define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %f32 = load volatile float, float addrspace(3)* undef, align 4
+  %neg.f32 = fsub float -0.0, %f32
+  %bc = bitcast float %neg.f32 to <2 x half>
+  %result = fadd <2 x half> %vec0, %bc
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
+; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+
+  %f32 = load volatile float, float addrspace(3)* undef, align 4
+  %neg.f32 = fsub float -0.0, %f32
+  %bc = bitcast float %neg.f32 to <2 x half>
+  %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %result = fadd <2 x half> %vec0, %shuf
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}extract_from_i64:
+; GCN: v_lshl_or_b32
+; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
+define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
+bb:
+  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
+  %i64 = load volatile i64, i64 addrspace(1)* undef
+
+  %elt0 = trunc i64 %i64 to i16
+  %hi = lshr i64 %i64, 16
+  %elt1 = trunc i64 %hi to i16
+
+  %ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0
+  %ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1
+  %result = add <2 x i16> %vec0, %ins1
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; Bitcast is final obstacle to identifying same source register
+; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: _or
+
+; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
+; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %scalar0 = load volatile i16, i16 addrspace(1)* undef
+  %shl = shl i16 %scalar0, 1
+  %shl.bc = bitcast i16 %shl to half
+
+  %fadd = fadd <2 x half> %vec2, <half 1.0, half 1.0>
+  %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; Bitcast is final obstacle to identifying same source register
+; GCN-LABEL: {{^}}mix_elt_types_op_sel:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: _or
+
+; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
+; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %scalar0 = load volatile i16, i16 addrspace(1)* undef
+  %scalar1 = load volatile half, half addrspace(1)* undef
+  %shl = shl i16 %scalar0, 1
+  %shl.bc = bitcast i16 %shl to half
+
+  %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0
+
+  %fadd = fadd <2 x half> %vec2, <half 1.0, half 1.0>
+  %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
 
 attributes #0 = { nounwind }
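The new tests all follow one recipe: build a <2 x half> or <2 x i16> operand by shuffling, and possibly negating, lanes of a loaded vector, then check that the swizzle folds into instruction modifiers rather than v_xor/v_lshl/v_or packing code. A hypothetical helper, with names of my own choosing, that predicts the modifier string expected on the third v_pk_fma_f16 operand from the lane routing:

#include <cstdio>

// Predict the VOP3P modifier syntax for operand 2 of v_pk_fma_f16, given
// which half of the source feeds each result lane and which lanes are
// negated. op_sel_hi defaults to 1, so it prints only when cleared.
static void predictMods(bool loReadsHi, bool hiReadsHi, bool negLo, bool negHi) {
  if (loReadsHi)
    printf(" op_sel:[0,0,1]");
  if (!hiReadsHi)
    printf(" op_sel_hi:[1,1,0]");
  if (negLo)
    printf(" neg_lo:[0,0,1]");
  if (negHi)
    printf(" neg_hi:[0,0,1]");
  printf("\n");
}

int main() {
  predictMods(true, true, true, true);    // neg_vector_hi broadcast case
  predictMods(true, false, false, false); // swap_vector case
  predictMods(true, false, true, false);  // blend_vector_neg_vector_0 case
}

Each line of output matches the modifier suffix in the corresponding GCN check above.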