llvm.org GIT mirror llvm / 39c7035
AMDGPU: Undo sub x, c canonicalization for v2i16. Should avoid regression from D62341. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363899 91177308-0d34-0410-b5e6-96231b3b80d8 (Matt Arsenault, a month ago)
7 changed file(s) with 140 addition(s) and 82 deletion(s). Raw diff Collapse all Expand all
6666
6767 namespace {
6868
69 static bool getConstantValue(SDValue N, uint32_t &Out) {
70 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
71 Out = C->getAPIntValue().getSExtValue();
72 return true;
73 }
74
75 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
76 Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
77 return true;
78 }
79
80 return false;
81 }
82
83 // TODO: Handle undef as zero
84 static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
85 bool Negate = false) {
86 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
87 uint32_t LHSVal, RHSVal;
88 if (getConstantValue(N->getOperand(0), LHSVal) &&
89 getConstantValue(N->getOperand(1), RHSVal)) {
90 SDLoc SL(N);
91 uint32_t K = Negate ?
92 (-LHSVal & 0xffff) | (-RHSVal << 16) :
93 (LHSVal & 0xffff) | (RHSVal << 16);
94 return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
95 DAG.getTargetConstant(K, SL, MVT::i32));
96 }
97
98 return nullptr;
99 }
100
101 static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
102 return packConstantV2I16(N, DAG, true);
103 }
104
69105 /// AMDGPU specific code to select AMDGPU machine instructions for
70106 /// SelectionDAG operations.
71107 class AMDGPUDAGToDAGISel : public SelectionDAGISel {
103139 private:
104140 std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
105141 bool isNoNanSrc(SDValue N) const;
106 bool isInlineImmediate(const SDNode *N) const;
142 bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
143 bool isNegInlineImmediate(const SDNode *N) const {
144 return isInlineImmediate(N, true);
145 }
146
107147 bool isVGPRImm(const SDNode *N) const;
108148 bool isUniformLoad(const SDNode *N) const;
109149 bool isUniformBr(const SDNode *N) const;
436476 return CurDAG->isKnownNeverNaN(N);
437477 }
438478
439 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
479 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
480 bool Negated) const {
481 // TODO: Handle undef
482
440483 const SIInstrInfo *TII = Subtarget->getInstrInfo();
441
442 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
443 return TII->isInlineConstant(C->getAPIntValue());
444
445 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
446 return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
484 if (Negated) {
485 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
486 return TII->isInlineConstant(-C->getAPIntValue());
487
488 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
489 return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
490
491 } else {
492 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
493 return TII->isInlineConstant(C->getAPIntValue());
494
495 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
496 return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
497 }
447498
448499 return false;
449500 }
562613 llvm_unreachable("invalid vector size");
563614 }
564615
565 static bool getConstantValue(SDValue N, uint32_t &Out) {
566 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
567 Out = C->getAPIntValue().getZExtValue();
568 return true;
569 }
570
571 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
572 Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
573 return true;
574 }
575
576 return false;
577 }
578
579616 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
580617 EVT VT = N->getValueType(0);
581618 unsigned NumVectorElts = VT.getVectorNumElements();
684721 unsigned NumVectorElts = VT.getVectorNumElements();
685722 if (VT.getScalarSizeInBits() == 16) {
686723 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
687 uint32_t LHSVal, RHSVal;
688 if (getConstantValue(N->getOperand(0), LHSVal) &&
689 getConstantValue(N->getOperand(1), RHSVal)) {
690 uint32_t K = LHSVal | (RHSVal << 16);
691 CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
692 CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
724 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
725 ReplaceNode(N, Packed);
693726 return;
694727 }
695728 }
604604 def ShiftAmt32Imm : PatLeaf <(imm), [{
605605 return N->getZExtValue() < 32;
606606 }]>;
607
608 def getNegV2I16Imm : SDNodeXForm<build_vector, [{
609 return SDValue(packNegConstantV2I16(N, *CurDAG), 0);
610 }]>;
611
612
613 // TODO: Handle undef as 0
614 def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
615 assert(N->getNumOperands() == 2);
616 assert(N->getOperand(0).getValueType().getSizeInBits() == 16);
617 SDValue Src0 = N->getOperand(0);
618 SDValue Src1 = N->getOperand(1);
619 if (Src0 == Src1)
620 return isNegInlineImmediate(Src0.getNode());
621
622 return (isNullConstant(Src0) && isNegInlineImmediate(Src1.getNode())) ||
623 (isNullConstant(Src1) && isNegInlineImmediate(Src0.getNode()));
624 }], getNegV2I16Imm>;
607625
608626 //===----------------------------------------------------------------------===//
609627 // Custom Operands
6767 def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
6868 def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
6969 def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
70
71
72 // Undo sub x, c -> add x, -c canonicalization since c is more likely
73 // an inline immediate than -c.
74 // The constant will be emitted as a mov, and folded later.
75 // TODO: We could directly encode the immediate now
76 def : GCNPat<
77 (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1),
78 (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp)
79 >;
7080
7181 multiclass MadFmaMixPats<SDPatternOperator fma_like,
7282 Instruction mix_inst,
102102 }
103103
104104 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
105 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
105 ; GFX9: v_pk_sub_u16 v{{[0-9]+}}, v{{[0-9]+}}, 1 op_sel_hi:[1,0]{{$}}
106106
107107 ; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
108108 ; VI: flat_load_dword [[LOAD:v[0-9]+]]
180180 ; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep:
181181 ; GFX900: ds_read_u16_d16_hi v1, v0
182182 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
183 ; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
184184 ; GFX900-NEXT: ds_read_u16_d16 v1, v0 offset:2
185185 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
186186 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
203203 ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
204204 ; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
205205 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
206 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
207207 ; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v1, v0
208208 ; GFX900-NEXT: s_setpc_b64
209209 define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
221221 ; GCN-LABEL: {{^}}chain_hi_to_lo_private_other_dep:
222222 ; GFX900: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen
223223 ; GFX900-NEXT: s_waitcnt vmcnt(0)
224 ; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
224 ; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
225225 ; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:2
226226 ; GFX900-NEXT: s_waitcnt vmcnt(0)
227227 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
243243 ; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off
244244 ; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
245245 ; GFX900-NEXT: s_waitcnt vmcnt(0)
246 ; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
246 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
247247 ; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v2, v0
248248 ; GFX900-NEXT: s_setpc_b64
249249 define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
263263 ; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1]
264264 ; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
265265 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
266 ; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
266 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
267267 ; GFX900-NEXT: v_bfi_b32 v0, v1, v2, v0
268268 ; GFX900-NEXT: s_setpc_b64
269269 define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
12321232 ; GFX9: ; %bb.0:
12331233 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
12341234 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1235 ; GFX9-NEXT: s_movk_i32 s4, 0xffe0
1236 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1237 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1238 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1239 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1240 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1241 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1242 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1243 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1244 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1245 ; GFX9-NEXT: v_pk_add_u16 v2, v3, s4 op_sel_hi:[1,0]
1235 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1236 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1237 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1238 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1239 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1240 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1241 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1242 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1243 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1244 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0]
12461245 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
12471246 ; GFX9-NEXT: s_endpgm
12481247 %tid = call i32 @llvm.amdgcn.workitem.id.x()
12951294 ; GFX9: ; %bb.0:
12961295 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
12971296 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1298 ; GFX9-NEXT: s_mov_b32 s4, 0xffe00000
1299 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1300 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1301 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1302 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1303 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1304 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1305 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1306 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1307 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1308 ; GFX9-NEXT: v_pk_add_u16 v2, v3, s4
1297 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1298 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1299 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1300 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1301 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1302 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1303 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1304 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1305 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1306 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0]
13091307 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
13101308 ; GFX9-NEXT: s_endpgm
13111309 %tid = call i32 @llvm.amdgcn.workitem.id.x()
13611359 ; GFX9: ; %bb.0:
13621360 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
13631361 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1364 ; GFX9-NEXT: s_mov_b32 s4, 0xffe0
1365 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1366 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1367 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1368 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1369 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1370 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1371 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1372 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1373 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1374 ; GFX9-NEXT: v_pk_add_u16 v2, v3, s4
1362 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1363 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1364 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1365 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1366 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1367 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1368 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1369 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1370 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1371 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32
13751372 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
13761373 ; GFX9-NEXT: s_endpgm
13771374 %tid = call i32 @llvm.amdgcn.workitem.id.x()
14391436 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
14401437 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
14411438 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1442 ; GFX9-NEXT: v_pk_add_u16 v2, v3, -16 op_sel_hi:[1,0]
1439 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0]
14431440 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
14441441 ; GFX9-NEXT: s_endpgm
14451442 %tid = call i32 @llvm.amdgcn.workitem.id.x()
15011498 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
15021499 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
15031500 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1504 ; GFX9-NEXT: v_pk_add_u16 v2, v3, -16 op_sel:[0,1] op_sel_hi:[1,0]
1501 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0]
15051502 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
15061503 ; GFX9-NEXT: s_endpgm
15071504 %tid = call i32 @llvm.amdgcn.workitem.id.x()
15661563 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
15671564 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
15681565 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1569 ; GFX9-NEXT: v_pk_add_u16 v2, v3, -16
1566 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16
15701567 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
15711568 ; GFX9-NEXT: s_endpgm
15721569 %tid = call i32 @llvm.amdgcn.workitem.id.x()
16331630 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
16341631 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
16351632 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1636 ; GFX9-NEXT: v_pk_add_u16 v2, v3, -4.0 op_sel_hi:[1,0]
1633 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0]
16371634 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
16381635 ; GFX9-NEXT: s_endpgm
16391636 %tid = call i32 @llvm.amdgcn.workitem.id.x()
17001697 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
17011698 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
17021699 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1703 ; GFX9-NEXT: v_pk_add_u16 v2, v3, 4.0 op_sel_hi:[1,0]
1700 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0]
17041701 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
17051702 ; GFX9-NEXT: s_endpgm
17061703 %tid = call i32 @llvm.amdgcn.workitem.id.x()
17671764 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
17681765 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
17691766 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1770 ; GFX9-NEXT: v_pk_add_u16 v2, v3, 2.0 op_sel_hi:[1,0]
1767 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0]
17711768 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
17721769 ; GFX9-NEXT: s_endpgm
17731770 %tid = call i32 @llvm.amdgcn.workitem.id.x()
18341831 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
18351832 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
18361833 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1837 ; GFX9-NEXT: v_pk_add_u16 v2, v3, -2.0 op_sel_hi:[1,0]
1834 ; GFX9-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0]
18381835 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
18391836 ; GFX9-NEXT: s_endpgm
18401837 %tid = call i32 @llvm.amdgcn.workitem.id.x()
55 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
66 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
77 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
8 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
8 ; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
99
1010 ; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
1111 ; CIVI: s_sub_i32
2929 ; GFX9: global_load_dword [[VAL:v[0-9]+]]
3030 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
3131 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
32 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
32 ; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
3333
3434 ; VI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
3535 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
6969 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
7070 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
7171 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
72 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
72 ; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
7373 define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
7474 %z0 = insertelement <2 x i16> undef, i16 0, i16 0
7575 %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
8787 ; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
8888 ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
8989 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
90 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
90 ; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
9191 define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
9292 %z0 = insertelement <2 x i16> undef, i16 0, i16 0
9393 %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
108108 ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[VAL1]]
109109 ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[VAL0]], [[SUB0]]
110110 ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[VAL1]], [[SUB1]]
111 ; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0]
112 ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0]
111 ; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0]
112 ; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0]
113113 define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
114114 %z0 = insertelement <4 x i16> undef, i16 0, i16 0
115115 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
132132
133133 ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]]
134134 ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]]
135 ; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2
135 ; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0]
136136
137137 ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]]
138138 ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]]
139 ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
139 ; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0]
140140 define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 {
141141 %z0 = insertelement <4 x i16> undef, i16 0, i16 0
142142 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1