llvm.org GIT mirror llvm / a4ee850

[X86][AVX2] Add support for combining v16i16 shuffles to VPBLENDW

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298929 91177308-0d34-0410-b5e6-96231b3b80d8
Simon Pilgrim, 3 years ago

4 changed files with 61 additions and 42 deletions.
   }
 
   // Attempt to combine to X86ISD::BLENDI.
-  // TODO - add 16i16 support (requires lane duplication).
-  if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
-                           (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
+  if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+                            (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
+      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
     uint64_t BlendMask = 0;
     bool ForceV1Zero = false, ForceV2Zero = false;
     SmallVector<int, 64> TargetMask(Mask.begin(), Mask.end());
     if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
                                   BlendMask)) {
-      // Determine a type compatible with X86ISD::BLENDI.
-      ShuffleVT = MaskVT;
-      if (Subtarget.hasAVX2()) {
-        if (ShuffleVT == MVT::v4i64)
-          ShuffleVT = MVT::v8i32;
-        else if (ShuffleVT == MVT::v2i64)
-          ShuffleVT = MVT::v4i32;
+      if (MaskVT == MVT::v16i16) {
+        // We can only use v16i16 PBLENDW if the lanes are repeated.
+        SmallVector<int, 8> RepeatedMask;
+        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
+                                        RepeatedMask)) {
+          assert(RepeatedMask.size() == 8 &&
+                 "Repeated mask size doesn't match!");
+          PermuteImm = 0;
+          for (int i = 0; i < 8; ++i)
+            if (RepeatedMask[i] >= 8)
+              PermuteImm |= 1 << i;
+          V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+          V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+          Shuffle = X86ISD::BLENDI;
+          ShuffleVT = MaskVT;
+          return true;
+        }
       } else {
-        if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
-          ShuffleVT = MVT::v8i16;
-        else if (ShuffleVT == MVT::v4i64)
-          ShuffleVT = MVT::v4f64;
-        else if (ShuffleVT == MVT::v8i32)
-          ShuffleVT = MVT::v8f32;
+        // Determine a type compatible with X86ISD::BLENDI.
+        ShuffleVT = MaskVT;
+        if (Subtarget.hasAVX2()) {
+          if (ShuffleVT == MVT::v4i64)
+            ShuffleVT = MVT::v8i32;
+          else if (ShuffleVT == MVT::v2i64)
+            ShuffleVT = MVT::v4i32;
+        } else {
+          if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+            ShuffleVT = MVT::v8i16;
+          else if (ShuffleVT == MVT::v4i64)
+            ShuffleVT = MVT::v4f64;
+          else if (ShuffleVT == MVT::v8i32)
+            ShuffleVT = MVT::v8f32;
+        }
+
+        if (!ShuffleVT.isFloatingPoint()) {
+          int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
+          BlendMask =
+              scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
+          ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
+          ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+        }
+
+        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+        PermuteImm = (unsigned)BlendMask;
+        Shuffle = X86ISD::BLENDI;
+        return true;
       }
-
-      V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
-      V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
-
-      if (!ShuffleVT.isFloatingPoint()) {
-        int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
-        BlendMask = scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
-        ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
-        ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
-      }
-
-      PermuteImm = (unsigned)BlendMask;
-      Shuffle = X86ISD::BLENDI;
-      return true;
     }
   }
 
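The reason for the repeated-lane restriction in the new v16i16 path is that VPBLENDW encodes only an 8-bit immediate, and the hardware applies that same immediate to both 128-bit lanes of a YMM register. The standalone sketch below is illustrative only, not the LLVM code above: the function name matchV16I16BlendImm and the plain-array mask convention (Mask[i] == i selects word i from the first source, Mask[i] == i + 16 from the second) are assumptions of this sketch, and unlike the real combine it does not handle undef or zeroable elements. It shows how a 16-element word blend collapses to a single immediate exactly when elements i and i + 8 make the same source choice.

// Standalone sketch (not LLVM code): derive the 8-bit VPBLENDW immediate
// from a 16-element word-blend mask, or fail when the lanes disagree.
#include <array>
#include <cstdio>
#include <optional>

std::optional<unsigned> matchV16I16BlendImm(const std::array<int, 16> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 8; ++i) {
    bool LoFromV1 = Mask[i] == i;          // low 128-bit lane, word i
    bool LoFromV2 = Mask[i] == i + 16;
    bool HiFromV1 = Mask[i + 8] == i + 8;  // high 128-bit lane, word i
    bool HiFromV2 = Mask[i + 8] == i + 24;
    if (LoFromV2 && HiFromV2)
      Imm |= 1u << i;                      // both lanes take word i from V2
    else if (!(LoFromV1 && HiFromV1))
      return std::nullopt;                 // not a blend, or the lanes disagree
  }
  return Imm;
}

int main() {
  // Odd words of both lanes come from V2: encodable, immediate 0xAA.
  std::array<int, 16> Repeated = {0, 17, 2,  19, 4,  21, 6,  23,
                                  8, 25, 10, 27, 12, 29, 14, 31};
  // Low lane from V1, high lane from V2: a valid blend, but not encodable
  // as VPBLENDW because the single immediate applies to both lanes.
  std::array<int, 16> NotRepeated = {0,  1,  2,  3,  4,  5,  6,  7,
                                     24, 25, 26, 27, 28, 29, 30, 31};
  if (auto Imm = matchV16I16BlendImm(Repeated))
    std::printf("vpblendw immediate = 0x%02X\n", *Imm);
  if (!matchV16I16BlendImm(NotRepeated))
    std::printf("lanes differ: no single VPBLENDW immediate\n");
  return 0;
}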
   ret <4 x i32> %v3
 }
 
-; FIXME: Missed vpblendw on AVX2 target
 define <8 x i32> @_clearupper8xi32a(<8 x i32>) nounwind {
 ; SSE-LABEL: _clearupper8xi32a:
 ; SSE: # BB#0:
...
 ;
 ; AVX2-LABEL: _clearupper8xi32a:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT: retq
   %x0 = extractelement <8 x i32> %0, i32 0
   %x1 = extractelement <8 x i32> %0, i32 1
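_clearupper8xi32a clears the upper 16 bits of every i32 element. With the new combine, the AVX2 codegen reaches this by blending the odd 16-bit words with a zeroed register (vpxor + vpblendw, immediate 0xAA per 128-bit lane) instead of loading a broadcast 0x0000FFFF mask for vandps, so the constant pool load disappears. A scalar model of the equivalence follows; it is illustrative only and the helper name is made up for this sketch.

// Scalar model (illustrative only): replacing the odd word of each 32-bit
// element with a word from an all-zero register clears its upper half,
// exactly like the old AND with a broadcast 0x0000FFFF mask.
#include <cassert>
#include <cstdint>

uint32_t clearUpperViaWordBlend(uint32_t X) {
  uint32_t Word0 = X & 0xFFFFu; // word 0 kept from the source
  uint32_t Word1 = 0;           // word 1 taken from the zeroed register
  return Word0 | (Word1 << 16);
}

int main() {
  for (uint32_t X : {0u, 0x12345678u, 0xFFFFFFFFu})
    assert(clearUpperViaWordBlend(X) == (X & 0xFFFFu));
  return 0;
}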
 ; AVX2: [[FPMASKCSTADDR_v8:.LCPI[0-9_]+]]:
 ; AVX2-NEXT: .long 1199570944 # float 65536
 
-; AVX2: [[MASKCSTADDR_v8:.LCPI[0-9_]+]]:
-; AVX2-NEXT: .long 65535 # 0xffff
-
 define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
 ; SSE2-LABEL: test_uitofp_v8i32_to_v8f32:
 ; SSE2: # BB#0:
...
 ; AVX2-NEXT: vcvtdq2ps %ymm1, %ymm1
 ; AVX2-NEXT: vbroadcastss [[FPMASKCSTADDR_v8]](%rip), %ymm2
 ; AVX2-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd [[MASKCSTADDR_v8]](%rip), %ymm2
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
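test_uitofp_v8i32_to_v8f32 lowers unsigned int-to-float by splitting each value into 16-bit halves, converting each half through a signed conversion, and recombining them as hi * 65536 + lo. Only the way the low half (x & 0xFFFF) is materialized changes here: a vxorps + vpblendw word blend replaces the vpand with a 65535 splat, which is why the MASKCSTADDR_v8 constant-pool entry above is no longer emitted. A scalar model of the lowering follows; it is illustrative only and the helper name is made up for this sketch.

// Scalar model (illustrative only) of the halves-based uitofp lowering the
// test checks; the diff only affects how Lo is formed in the vector code.
#include <cassert>
#include <cstdint>

float uitofpViaHalves(uint32_t X) {
  int32_t Hi = static_cast<int32_t>(X >> 16);     // high 16 bits
  int32_t Lo = static_cast<int32_t>(X & 0xFFFFu); // low 16 bits (the word blend with zero)
  return static_cast<float>(Hi) * 65536.0f + static_cast<float>(Lo);
}

int main() {
  for (uint32_t X : {0u, 65535u, 65536u, 0x80000000u, 0xFFFFFFFFu})
    assert(uitofpViaHalves(X) == static_cast<float>(X));
  return 0;
}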
 define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
 ; X32-LABEL: combine_and_pshufb:
 ; X32: # BB#0:
-; X32-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_and_pshufb:
 ; X64: # BB#0:
-; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X64-NEXT: retq
   %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> )
...
 define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_and:
 ; X32: # BB#0:
-; X32-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: combine_pshufb_and:
 ; X64: # BB#0:
-; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X64-NEXT: retq
   %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> )
   %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32>
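Both pshufb/and tests reduce to the same pattern: keep word 0 of every 64-bit group and zero words 1-3. The combine can now express that as vpxor + vpblendw with immediate 0xEE per 128-bit lane, and because the pattern is identical in both lanes it satisfies the new repeated-lane requirement. A per-qword scalar model follows; it is illustrative only and the names are made up for this sketch.

// Per-qword scalar model (illustrative only) of the blend these tests check:
// a 4-bit immediate of 0xE zeroes words 1-3 of one 64-bit group; the YMM
// instruction repeats that choice for both qwords of each lane, i.e. 0xEE.
#include <cassert>
#include <cstdint>

uint64_t blendWordsWithZero(uint64_t X, unsigned Imm) {
  uint64_t Result = 0;
  for (int W = 0; W < 4; ++W) {
    uint64_t Word = (X >> (16 * W)) & 0xFFFFu;
    bool FromZero = (Imm >> W) & 1u; // immediate bit set: take this word from zero
    if (!FromZero)
      Result |= Word << (16 * W);
  }
  return Result;
}

int main() {
  for (uint64_t X : {0ull, 0x0123456789ABCDEFull, ~0ull})
    assert(blendWordsWithZero(X, 0xE) == (X & 0xFFFFull)); // only the low word survives
  return 0;
}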