llvm.org GIT mirror llvm / 65b74e1
Add support for 256-bit versions of the VPERMIL instruction. This is a new instruction introduced in AVX, which can operate on 128- and 256-bit vectors. It considers a 256-bit vector as two independent 128-bit lanes. It can permute any 32- or 64-bit elements inside a lane, and restricts the second lane to have the same permutation as the first one. With the improved splat support introduced earlier today, adding codegen for this instruction enables more efficient 256-bit code: Instead of: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vextractf128 $1, %ymm0, %xmm1 shufps $1, %xmm1, %xmm1 movss %xmm1, 28(%rsp) movss %xmm1, 24(%rsp) movss %xmm1, 20(%rsp) movss %xmm1, 16(%rsp) vextractf128 $0, %ymm0, %xmm0 shufps $1, %xmm0, %xmm0 movss %xmm0, 12(%rsp) movss %xmm0, 8(%rsp) movss %xmm0, 4(%rsp) movss %xmm0, (%rsp) vmovaps (%rsp), %ymm0 We get: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vpermilps $85, %ymm0, %ymm0 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@135662 91177308-0d34-0410-b5e6-96231b3b80d8 Bruno Cardoso Lopes 9 years ago
8 changed file(s) with 137 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
202202 // FALL THROUGH.
203203 case X86::UNPCKHPSrm:
204204 DecodeUNPCKHPMask(4, ShuffleMask);
205 Src1Name = getRegName(MI->getOperand(0).getReg());
206 break;
207 case X86::VPERMILPSYri:
208 DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(),
209 ShuffleMask);
210 Src1Name = getRegName(MI->getOperand(0).getReg());
211 break;
212 case X86::VPERMILPDYri:
213 DecodeVPERMILPSMask(4, MI->getOperand(2).getImm(),
214 ShuffleMask);
205215 Src1Name = getRegName(MI->getOperand(0).getReg());
206216 break;
207217 }
186186 }
187187 }
188188
189 void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
190 SmallVectorImpl &ShuffleMask) {
191 DecodeVPERMILMask(MVT::getVectorVT(MVT::i32, NElts), Imm, ShuffleMask);
192 }
193
194 void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
195 SmallVectorImpl &ShuffleMask) {
196 DecodeVPERMILMask(MVT::getVectorVT(MVT::i64, NElts), Imm, ShuffleMask);
197 }
198
199 // DecodeVPERMILMask - Decodes VPERMIL permutes for any 128-bit
200 // with 32/64-bit elements. For 256-bit vectors, it's considered
201 // as two 128 lanes and the mask of the first lane should be
202 // identical of the second one.
203 void DecodeVPERMILMask(EVT VT, unsigned Imm,
204 SmallVectorImpl &ShuffleMask) {
205 unsigned NumElts = VT.getVectorNumElements();
206 unsigned NumLanes = VT.getSizeInBits()/128;
207
208 for (unsigned l = 0; l != NumLanes; ++l) {
209 for (unsigned i = 0; i != NumElts/NumLanes; ++i) {
210 unsigned Idx = (Imm >> (i*2)) & 0x3 ;
211 ShuffleMask.push_back(Idx+(l*NumElts/NumLanes));
212 }
213 }
214 }
215
189216 } // llvm namespace
8181 void DecodeUNPCKLPMask(EVT VT,
8282 SmallVectorImpl &ShuffleMask);
8383
84
85 void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
86 SmallVectorImpl &ShuffleMask);
87
88 void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
89 SmallVectorImpl &ShuffleMask);
90
91 // DecodeVPERMILMask - Decodes a VPERMIL permute for vectors
92 // with 32/64-bit elements. A 256-bit vector is treated as two
93 // 128-bit lanes, and the mask of the second lane must be
94 // identical to that of the first one.
95 void DecodeVPERMILMask(EVT VT, unsigned Imm,
96 SmallVectorImpl &ShuffleMask);
97
8498 } // llvm namespace
8599
86100 #endif
27462746 case X86ISD::PUNPCKHBW:
27472747 case X86ISD::PUNPCKHDQ:
27482748 case X86ISD::PUNPCKHQDQ:
2749 case X86ISD::VPERMIL:
27492750 return true;
27502751 }
27512752 return false;
27712772 case X86ISD::PSHUFD:
27722773 case X86ISD::PSHUFHW:
27732774 case X86ISD::PSHUFLW:
2775 case X86ISD::VPERMIL:
27742776 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
27752777 }
27762778
34193421 SmallVector M;
34203422 N->getMask(M);
34213423 return ::isMOVLMask(M, N->getValueType(0));
3424 }
3425
3426 /// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
3427 /// specifies a shuffle of elements that is suitable for input to VPERMIL*.
3428 static bool isVPERMILMask(const SmallVectorImpl &Mask, EVT VT) {
3429 unsigned NumElts = VT.getVectorNumElements();
3430 unsigned NumLanes = VT.getSizeInBits()/128;
3431
3432 // Match any permutation of 128-bit vector with 32/64-bit types
3433 if (NumLanes == 1) {
3434 if (NumElts == 4 || NumElts == 2)
3435 return true;
3436 return false;
3437 }
3438
3439 // Only match 256-bit with 32/64-bit types
3440 if (NumElts != 8 && NumElts != 4)
3441 return false;
3442
3443 // The mask on the high lane should be the same as the low. Actually,
3444 // they can differ if any of the corresponding index in a lane is undef.
3445 int LaneSize = NumElts/NumLanes;
3446 for (int i = 0; i < LaneSize; ++i) {
3447 int HighElt = i+LaneSize;
3448 if (Mask[i] < 0 || Mask[HighElt] < 0)
3449 continue;
3450
3451 if (Mask[HighElt]-Mask[i] != LaneSize)
3452 return false;
3453 }
3454
3455 return true;
3456 }
3457
3458 /// getShuffleVPERMILImmediateediate - Return the appropriate immediate to shuffle
3459 /// the specified VECTOR_MASK mask with VPERMIL* instructions.
3460 static unsigned getShuffleVPERMILImmediate(SDNode *N) {
3461 ShuffleVectorSDNode *SVOp = cast(N);
3462 EVT VT = SVOp->getValueType(0);
3463
3464 int NumElts = VT.getVectorNumElements();
3465 int NumLanes = VT.getSizeInBits()/128;
3466
3467 unsigned Mask = 0;
3468 for (int i = 0; i < NumElts/NumLanes /* lane size */; ++i)
3469 Mask |= SVOp->getMaskElt(i) << (i*2);
3470
3471 return Mask;
34223472 }
34233473
34243474 /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
40964146 return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
40974147 Depth+1);
40984148 }
4149 case X86ISD::VPERMIL:
4150 ImmN = N->getOperand(N->getNumOperands()-1);
4151 DecodeVPERMILMask(VT, cast(ImmN)->getZExtValue(),
4152 ShuffleMask);
40994153 default:
41004154 assert("not implemented for target shuffle node");
41014155 return SDValue();
60416095 // Handle all 4 wide cases with a number of shuffles.
60426096 if (NumElems == 4)
60436097 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
6098
6099 // Handle VPERMIL permutations
6100 if (isVPERMILMask(M, VT)) {
6101 unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
6102 if (VT == MVT::v8f32)
6103 return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
6104 }
60446105
60456106 return SDValue();
60466107 }
96599720 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
96609721 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
96619722 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
9723 case X86ISD::VPERMIL: return "X86ISD::VPERMIL";
96629724 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
96639725 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
96649726 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
1246412526 case X86ISD::PSHUFLW:
1246512527 case X86ISD::MOVSS:
1246612528 case X86ISD::MOVSD:
12529 case X86ISD::VPERMIL:
1246712530 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
1246812531 }
1246912532
270270 PUNPCKHWD,
271271 PUNPCKHDQ,
272272 PUNPCKHQDQ,
273 VPERMIL,
273274
274275 // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
275276 // according to %al. An operator is needed so that this can be expanded
149149 def X86Punpckhdq : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>;
150150 def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>;
151151
152 def X86VPermil : SDNode<"X86ISD::VPERMIL", SDTShuff2OpI>;
153
152154 //===----------------------------------------------------------------------===//
153155 // SSE Complex Patterns
154156 //===----------------------------------------------------------------------===//
55285528 // The AVX version of some but not all of them are described here, and more
55295529 // should come in a near future.
55305530
5531 // Shuffle with VPERMIL instructions
5532 def : Pat<(v8f32 (X86VPermil VR256:$src1, (i8 imm:$imm))),
5533 (VPERMILPSYri VR256:$src1, imm:$imm)>;
5534
55315535 // Shuffle with PSHUFD instruction folding loads. The first two patterns match
55325536 // SSE2 loads, which are always promoted to v2i64. The last one should match
55335537 // the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s

; FIXME: use avx versions for punpcklbw and punpckhbw

; Splat byte 5 of a <32 x i8>: the byte unpacks widen the splatted byte to a
; 32-bit element (dword 1 of each lane), then vpermilps $85 (0b01010101)
; replicates that dword across both lanes.
; CHECK: vextractf128 $0
; CHECK-NEXT: punpcklbw
; CHECK-NEXT: punpckhbw
; CHECK-NEXT: vinsertf128 $0
; CHECK-NEXT: vinsertf128 $1
; CHECK-NEXT: vpermilps $85
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <32 x i8> %shuffle
}
15