[x86] transform vector inc/dec to use -1 constant (PR33483)

Convert vector increment or decrement to sub/add with an all-ones constant:

  add X, <1, 1...> --> sub X, <-1, -1...>
  sub X, <1, 1...> --> add X, <-1, -1...>

The all-ones vector constant can be materialized using a pcmpeq instruction
that is commonly recognized as an idiom (has no register dependency), so
that's better than loading a splat 1 constant. AVX512 uses 'vpternlogd' for
512-bit vectors because there is apparently no better way to produce 512
one-bits.

The general advantages of this lowering are:

1. pcmpeq has lower latency than a memop on every uarch I looked at in
   Agner's tables, so in theory, this could be better for perf, but...
2. That seems unlikely to affect any OOO implementation, and I can't measure
   any real perf difference from this transform on Haswell or Jaguar, but...
3. It doesn't look like it from the diffs, but this is an overall size win
   because we eliminate 16 - 64 constant bytes in the case of a vector load.
   If we're broadcasting a scalar load (which might itself be a bug), then
   we're replacing a scalar constant load + broadcast with a single cheap op,
   so that should always be smaller/better too.
4. This makes the DAG/isel output more consistent - we use pcmpeq already for
   padd x, -1 and psub x, -1, so we should use that form for +1 too because
   we can.

If there's some reason to favor a constant load on some CPU, let's make the
reverse transform for all of these cases (either here in the DAG or in a
later machine pass).

This should fix: https://bugs.llvm.org/show_bug.cgi?id=33483

Differential Revision: https://reviews.llvm.org/D34336

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306289 91177308-0d34-0410-b5e6-96231b3b80d8

Sanjay Patel · 2 years ago
21 changed file(s) with 2288 addition(s) and 2147 deletion(s).
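Before the diff, a quick aside (not part of the commit): the rewrite is a pure two's-complement identity; adding a splat of 1 and subtracting a splat of all-ones (-1) produce the same lanes regardless of wraparound. A minimal standalone C++ check of that identity:

// Standalone illustration of the identity the transform relies on (not LLVM
// code): the all-ones bit pattern is -1, and x + 1 == x - (-1) per 32-bit
// lane under two's-complement wraparound, so swapping add/sub is lossless.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t AllOnes = ~0u;                    // bit pattern of -1
  assert(AllOnes == static_cast<uint32_t>(-1));
  for (uint32_t X : {0u, 1u, 42u, 0x7FFFFFFFu, 0xFFFFFFFFu}) {
    assert(X + 1u == X - AllOnes);                 // add X,1  ==  sub X,-1
    assert(X - 1u == X + AllOnes);                 // sub X,1  ==  add X,-1
  }
  return 0;
}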
   return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
 }
 
+/// Convert vector increment or decrement to sub/add with an all-ones constant:
+/// add X, <1, 1...> --> sub X, <-1, -1...>
+/// sub X, <1, 1...> --> add X, <-1, -1...>
+/// The all-ones vector constant can be materialized using a pcmpeq instruction
+/// that is commonly recognized as an idiom (has no register dependency), so
+/// that's better/smaller than loading a splat 1 constant.
+static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB &&
+         "Unexpected opcode for increment/decrement transform");
+
+  // Pseudo-legality check: getOnesVector() expects one of these types, so bail
+  // out and wait for legalization if we have an unsupported vector length.
+  EVT VT = N->getValueType(0);
+  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+    return SDValue();
+
+  SDNode *N1 = N->getOperand(1).getNode();
+  APInt SplatVal;
+  if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
+    return SDValue();
+
+  SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
+  unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+  return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
+}
+
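The doc comment above leans on the pcmpeq idiom: comparing a register with itself sets every lane to all-ones, so the -1 splat costs one dependency-free instruction instead of a constant-pool load. A scalar model of one lane (an illustration, not the actual lowering code):

// Scalar model of one 32-bit lane of PCMPEQD (illustration only): equal
// lanes become all-ones, unequal lanes become zero. Comparing any register
// with itself therefore yields the all-ones value -1 in every lane, which is
// exactly the constant combineIncDecVector substitutes for the +1 splat.
#include <cassert>
#include <cstdint>

static uint32_t pcmpeqdLane(uint32_t A, uint32_t B) {
  return A == B ? ~0u : 0u;
}

int main() {
  for (uint32_t X : {0u, 1u, 123456789u, 0xFFFFFFFFu})
    assert(pcmpeqdLane(X, X) == static_cast<uint32_t>(-1)); // any input works
  return 0;
}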
 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
   const SDNodeFlags Flags = N->getFlags();
 ...
        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
       isHorizontalBinOp(Op0, Op1, true))
     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
+
+  if (SDValue V = combineIncDecVector(N, DAG))
+    return V;
 
   return combineAddOrSubToADCOrSBB(N, DAG);
 }
 ...
        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
       isHorizontalBinOp(Op0, Op1, false))
     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
+
+  if (SDValue V = combineIncDecVector(N, DAG))
+    return V;
 
   return combineAddOrSubToADCOrSBB(N, DAG);
 }
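The test diffs below show the effect on lowered code: the splat-of-1 constant load (a movdqa of [1,1,1,1] feeding paddd) disappears in favor of pcmpeqd plus psubd. A hypothetical way to see the same thing outside the test suite, assuming a clang build that includes this change, is to compile a trivial vector increment at -O2 (clang++ -O2 -S) and inspect the assembly:

// Hypothetical reproducer, not from the commit. With this change, the x86
// backend should materialize the constant with pcmpeqd and use psubd/paddd
// instead of loading a <1,1,1,1> splat from the constant pool.
typedef int v4si __attribute__((vector_size(16))); // clang/GCC vector extension

v4si inc(v4si X) { return X + 1; } // add X, <1,1,1,1> --> sub X, <-1,-1,-1,-1>
v4si dec(v4si X) { return X - 1; } // sub X, <1,1,1,1> --> add X, <-1,-1,-1,-1>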
8989 define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
9090 ; SSE2-LABEL: avg_v32i8:
9191 ; SSE2: # BB#0:
92 ; SSE2-NEXT: movdqa (%rdi), %xmm9
93 ; SSE2-NEXT: movdqa 16(%rdi), %xmm12
94 ; SSE2-NEXT: movdqa (%rsi), %xmm4
92 ; SSE2-NEXT: movdqa (%rdi), %xmm3
93 ; SSE2-NEXT: movdqa 16(%rdi), %xmm8
94 ; SSE2-NEXT: movdqa (%rsi), %xmm0
9595 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
96 ; SSE2-NEXT: pxor %xmm0, %xmm0
97 ; SSE2-NEXT: movdqa %xmm9, %xmm11
98 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
99 ; SSE2-NEXT: movdqa %xmm11, %xmm15
100 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
101 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
102 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
103 ; SSE2-NEXT: movdqa %xmm9, %xmm14
104 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
105 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
106 ; SSE2-NEXT: movdqa %xmm12, %xmm10
107 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
108 ; SSE2-NEXT: movdqa %xmm10, %xmm13
109 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
110 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
111 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
112 ; SSE2-NEXT: movdqa %xmm12, %xmm2
113 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
114 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
115 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
116 ; SSE2-NEXT: movdqa %xmm4, %xmm3
117 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
118 ; SSE2-NEXT: movdqa %xmm3, %xmm7
119 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
120 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
121 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
122 ; SSE2-NEXT: movdqa %xmm4, %xmm5
123 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
124 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
125 ; SSE2-NEXT: movdqa %xmm1, %xmm2
126 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
127 ; SSE2-NEXT: movdqa %xmm2, %xmm8
128 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
129 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
130 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
131 ; SSE2-NEXT: movdqa %xmm1, %xmm6
132 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
133 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
134 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
135 ; SSE2-NEXT: paddd %xmm0, %xmm7
136 ; SSE2-NEXT: paddd %xmm15, %xmm7
137 ; SSE2-NEXT: paddd %xmm0, %xmm3
138 ; SSE2-NEXT: paddd %xmm11, %xmm3
139 ; SSE2-NEXT: paddd %xmm0, %xmm5
140 ; SSE2-NEXT: paddd %xmm14, %xmm5
141 ; SSE2-NEXT: paddd %xmm0, %xmm4
142 ; SSE2-NEXT: paddd %xmm9, %xmm4
143 ; SSE2-NEXT: paddd %xmm0, %xmm8
144 ; SSE2-NEXT: paddd %xmm13, %xmm8
145 ; SSE2-NEXT: paddd %xmm0, %xmm2
146 ; SSE2-NEXT: paddd %xmm10, %xmm2
147 ; SSE2-NEXT: paddd %xmm0, %xmm6
148 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
149 ; SSE2-NEXT: paddd %xmm0, %xmm1
150 ; SSE2-NEXT: paddd %xmm12, %xmm1
96 ; SSE2-NEXT: pxor %xmm4, %xmm4
97 ; SSE2-NEXT: movdqa %xmm3, %xmm5
98 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
99 ; SSE2-NEXT: movdqa %xmm5, %xmm6
100 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
101 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
102 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
103 ; SSE2-NEXT: movdqa %xmm3, %xmm12
104 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
105 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
106 ; SSE2-NEXT: movdqa %xmm8, %xmm7
107 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
108 ; SSE2-NEXT: movdqa %xmm7, %xmm11
109 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
110 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
111 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
112 ; SSE2-NEXT: movdqa %xmm8, %xmm10
113 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
114 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
115 ; SSE2-NEXT: movdqa %xmm0, %xmm2
116 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
117 ; SSE2-NEXT: movdqa %xmm2, %xmm9
118 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
119 ; SSE2-NEXT: paddd %xmm6, %xmm9
120 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
121 ; SSE2-NEXT: paddd %xmm5, %xmm2
122 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
123 ; SSE2-NEXT: movdqa %xmm0, %xmm5
124 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
125 ; SSE2-NEXT: paddd %xmm12, %xmm5
126 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
127 ; SSE2-NEXT: paddd %xmm3, %xmm0
128 ; SSE2-NEXT: movdqa %xmm1, %xmm3
129 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
130 ; SSE2-NEXT: movdqa %xmm3, %xmm6
131 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
132 ; SSE2-NEXT: paddd %xmm11, %xmm6
133 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
134 ; SSE2-NEXT: paddd %xmm7, %xmm3
135 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
136 ; SSE2-NEXT: movdqa %xmm1, %xmm7
137 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
138 ; SSE2-NEXT: paddd %xmm10, %xmm7
139 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
140 ; SSE2-NEXT: paddd %xmm8, %xmm1
141 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
142 ; SSE2-NEXT: psubd %xmm4, %xmm9
143 ; SSE2-NEXT: psubd %xmm4, %xmm2
144 ; SSE2-NEXT: psubd %xmm4, %xmm5
145 ; SSE2-NEXT: psubd %xmm4, %xmm0
146 ; SSE2-NEXT: psubd %xmm4, %xmm6
147 ; SSE2-NEXT: psubd %xmm4, %xmm3
148 ; SSE2-NEXT: psubd %xmm4, %xmm7
149 ; SSE2-NEXT: psubd %xmm4, %xmm1
150 ; SSE2-NEXT: psrld $1, %xmm1
151 ; SSE2-NEXT: psrld $1, %xmm7
151152 ; SSE2-NEXT: psrld $1, %xmm3
152 ; SSE2-NEXT: psrld $1, %xmm7
153 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
154 ; SSE2-NEXT: pand %xmm0, %xmm7
155 ; SSE2-NEXT: pand %xmm0, %xmm3
156 ; SSE2-NEXT: packuswb %xmm7, %xmm3
157 ; SSE2-NEXT: psrld $1, %xmm4
153 ; SSE2-NEXT: psrld $1, %xmm6
154 ; SSE2-NEXT: psrld $1, %xmm0
158155 ; SSE2-NEXT: psrld $1, %xmm5
159 ; SSE2-NEXT: pand %xmm0, %xmm5
160 ; SSE2-NEXT: pand %xmm0, %xmm4
161 ; SSE2-NEXT: packuswb %xmm5, %xmm4
162 ; SSE2-NEXT: packuswb %xmm3, %xmm4
163156 ; SSE2-NEXT: psrld $1, %xmm2
164 ; SSE2-NEXT: psrld $1, %xmm8
165 ; SSE2-NEXT: pand %xmm0, %xmm8
166 ; SSE2-NEXT: pand %xmm0, %xmm2
167 ; SSE2-NEXT: packuswb %xmm8, %xmm2
168 ; SSE2-NEXT: psrld $1, %xmm1
169 ; SSE2-NEXT: psrld $1, %xmm6
170 ; SSE2-NEXT: pand %xmm0, %xmm6
171 ; SSE2-NEXT: pand %xmm0, %xmm1
172 ; SSE2-NEXT: packuswb %xmm6, %xmm1
173 ; SSE2-NEXT: packuswb %xmm2, %xmm1
157 ; SSE2-NEXT: psrld $1, %xmm9
158 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
159 ; SSE2-NEXT: pand %xmm4, %xmm9
160 ; SSE2-NEXT: pand %xmm4, %xmm2
161 ; SSE2-NEXT: packuswb %xmm9, %xmm2
162 ; SSE2-NEXT: pand %xmm4, %xmm5
163 ; SSE2-NEXT: pand %xmm4, %xmm0
164 ; SSE2-NEXT: packuswb %xmm5, %xmm0
165 ; SSE2-NEXT: packuswb %xmm2, %xmm0
166 ; SSE2-NEXT: pand %xmm4, %xmm6
167 ; SSE2-NEXT: pand %xmm4, %xmm3
168 ; SSE2-NEXT: packuswb %xmm6, %xmm3
169 ; SSE2-NEXT: pand %xmm4, %xmm7
170 ; SSE2-NEXT: pand %xmm4, %xmm1
171 ; SSE2-NEXT: packuswb %xmm7, %xmm1
172 ; SSE2-NEXT: packuswb %xmm3, %xmm1
174173 ; SSE2-NEXT: movdqu %xmm1, (%rax)
175 ; SSE2-NEXT: movdqu %xmm4, (%rax)
174 ; SSE2-NEXT: movdqu %xmm0, (%rax)
176175 ; SSE2-NEXT: retq
177176 ;
178177 ; AVX1-LABEL: avg_v32i8:
183182 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
184183 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
185184 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
185 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
186186 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
187187 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
188 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1]
189 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
190 ; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0
188 ; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
191189 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
192 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
193190 ; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
194191 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
195 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
196 ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm9
192 ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
197193 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
198 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
199194 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
200195 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
201 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
202196 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
203197 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
204 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
205198 ; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
206199 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
207 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
200 ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
201 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
208202 ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
209 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
210 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
211 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
212 ; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
203 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
204 ; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
205 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
206 ; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
207 ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
208 ; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
209 ; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
210 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
211 ; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
212 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
213 ; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
214 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
215 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
216 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
217 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
213218 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
214 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
215 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
216 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
217 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
218 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
219 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm1
220 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3
221 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
222 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
223 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
224 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
225 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm1
226 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm3
227 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
228 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
229 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
230 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
231 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm3
232 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
233 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
234 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
219 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
220 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
221 ; AVX1-NEXT: vpand %xmm0, %xmm7, %xmm7
222 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm1
223 ; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1
224 ; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm2
225 ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm3
226 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
235227 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
236 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
228 ; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm2
229 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm3
230 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
231 ; AVX1-NEXT: vpand %xmm0, %xmm6, %xmm3
232 ; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
233 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
234 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
235 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
237236 ; AVX1-NEXT: vmovups %ymm0, (%rax)
238237 ; AVX1-NEXT: vzeroupper
239238 ; AVX1-NEXT: retq
268267 define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
269268 ; SSE2-LABEL: avg_v64i8:
270269 ; SSE2: # BB#0:
271 ; SSE2-NEXT: subq $152, %rsp
272 ; SSE2-NEXT: .Lcfi0:
273 ; SSE2-NEXT: .cfi_def_cfa_offset 160
274 ; SSE2-NEXT: movdqa (%rdi), %xmm4
275 ; SSE2-NEXT: movdqa 16(%rdi), %xmm3
276 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2
277 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1
270 ; SSE2-NEXT: movdqa (%rdi), %xmm6
271 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2
272 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
273 ; SSE2-NEXT: movdqa 48(%rdi), %xmm0
274 ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
275 ; SSE2-NEXT: movdqa (%rsi), %xmm5
276 ; SSE2-NEXT: movdqa 16(%rsi), %xmm13
277 ; SSE2-NEXT: movdqa 32(%rsi), %xmm11
278278 ; SSE2-NEXT: pxor %xmm0, %xmm0
279 ; SSE2-NEXT: movdqa %xmm4, %xmm5
280 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
281 ; SSE2-NEXT: movdqa %xmm5, %xmm6
282 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
283 ; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
279 ; SSE2-NEXT: movdqa %xmm6, %xmm4
280 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
281 ; SSE2-NEXT: movdqa %xmm4, %xmm7
282 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
283 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
284 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
285 ; SSE2-NEXT: movdqa %xmm6, %xmm12
286 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
287 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
288 ; SSE2-NEXT: movdqa %xmm2, %xmm15
289 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
290 ; SSE2-NEXT: movdqa %xmm15, %xmm14
291 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
292 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
293 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
294 ; SSE2-NEXT: movdqa %xmm2, %xmm8
295 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
296 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
297 ; SSE2-NEXT: movdqa %xmm5, %xmm10
298 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
299 ; SSE2-NEXT: movdqa %xmm10, %xmm3
300 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
301 ; SSE2-NEXT: paddd %xmm7, %xmm3
302 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
303 ; SSE2-NEXT: movdqa %xmm1, %xmm7
304 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
305 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
306 ; SSE2-NEXT: paddd %xmm4, %xmm10
307 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
308 ; SSE2-NEXT: movdqa %xmm5, %xmm3
309 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
310 ; SSE2-NEXT: paddd %xmm12, %xmm3
311 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
284312 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
313 ; SSE2-NEXT: paddd %xmm6, %xmm5
285314 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
286 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
287 ; SSE2-NEXT: movdqa %xmm4, %xmm5
315 ; SSE2-NEXT: movdqa %xmm13, %xmm4
316 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
317 ; SSE2-NEXT: movdqa %xmm4, %xmm12
318 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
319 ; SSE2-NEXT: paddd %xmm14, %xmm12
320 ; SSE2-NEXT: movdqa %xmm7, %xmm5
288321 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
289 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
322 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
323 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
290324 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
291 ; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
292 ; SSE2-NEXT: movdqa %xmm3, %xmm4
293 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
294 ; SSE2-NEXT: movdqa %xmm4, %xmm5
295 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
296 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
297 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
298 ; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
299 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
300 ; SSE2-NEXT: movdqa %xmm3, %xmm4
301 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
302 ; SSE2-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
303 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
304 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
305 ; SSE2-NEXT: movdqa %xmm2, %xmm3
306 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
307 ; SSE2-NEXT: movdqa %xmm3, %xmm4
308 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
309 ; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
310 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
311 ; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
312 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
313 ; SSE2-NEXT: movdqa %xmm2, %xmm3
314 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
315 ; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
316 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
317 ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
318 ; SSE2-NEXT: movdqa %xmm1, %xmm2
319 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
320 ; SSE2-NEXT: movdqa %xmm2, %xmm3
321 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
322 ; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
323 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
324 ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
325 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
325 ; SSE2-NEXT: paddd %xmm15, %xmm4
326 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
327 ; SSE2-NEXT: movdqa %xmm13, %xmm15
328 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
329 ; SSE2-NEXT: paddd %xmm8, %xmm15
330 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
331 ; SSE2-NEXT: paddd %xmm2, %xmm13
332 ; SSE2-NEXT: movdqa %xmm11, %xmm6
333 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
334 ; SSE2-NEXT: movdqa %xmm6, %xmm9
335 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
336 ; SSE2-NEXT: paddd %xmm5, %xmm9
326337 ; SSE2-NEXT: movdqa %xmm1, %xmm2
327338 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
328 ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
329339 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
330 ; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
331 ; SSE2-NEXT: movdqa (%rsi), %xmm10
332 ; SSE2-NEXT: movdqa %xmm10, %xmm4
333 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
334 ; SSE2-NEXT: movdqa %xmm4, %xmm11
335 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
336 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
337 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
338 ; SSE2-NEXT: movdqa %xmm10, %xmm12
339 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
340 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
341 ; SSE2-NEXT: movdqa 16(%rsi), %xmm15
342 ; SSE2-NEXT: movdqa %xmm15, %xmm7
343 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
344 ; SSE2-NEXT: movdqa %xmm7, %xmm14
340 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
341 ; SSE2-NEXT: paddd %xmm7, %xmm6
342 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
343 ; SSE2-NEXT: movdqa %xmm11, %xmm14
345344 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
346 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
347 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
348 ; SSE2-NEXT: movdqa %xmm15, %xmm6
349 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
350 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
351 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
352 ; SSE2-NEXT: movdqa %xmm2, %xmm8
353 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
354 ; SSE2-NEXT: movdqa %xmm8, %xmm3
355 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
356 ; SSE2-NEXT: movdqa %xmm3, %xmm13
357 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
345 ; SSE2-NEXT: paddd %xmm2, %xmm14
346 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
347 ; SSE2-NEXT: movdqa %xmm5, %xmm2
348 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
349 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
350 ; SSE2-NEXT: paddd %xmm1, %xmm11
351 ; SSE2-NEXT: movdqa %xmm2, %xmm1
352 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
353 ; SSE2-NEXT: movdqa 48(%rsi), %xmm7
354 ; SSE2-NEXT: movdqa %xmm7, %xmm3
355 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
356 ; SSE2-NEXT: movdqa %xmm3, %xmm8
357 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
358 ; SSE2-NEXT: paddd %xmm1, %xmm8
359 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
360 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
361 ; SSE2-NEXT: paddd %xmm2, %xmm3
362 ; SSE2-NEXT: movdqa %xmm5, %xmm2
358363 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
359364 ; SSE2-NEXT: movdqa %xmm2, %xmm1
360365 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
366 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
367 ; SSE2-NEXT: movdqa %xmm7, %xmm5
368 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
369 ; SSE2-NEXT: paddd %xmm1, %xmm5
370 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
371 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
372 ; SSE2-NEXT: paddd %xmm2, %xmm7
373 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
374 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
375 ; SSE2-NEXT: psubd %xmm0, %xmm1
361376 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
362 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
363 ; SSE2-NEXT: movdqa 48(%rsi), %xmm1
364 ; SSE2-NEXT: movdqa %xmm1, %xmm9
365 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
366 ; SSE2-NEXT: movdqa %xmm9, %xmm5
367 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
368 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
369 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
370 ; SSE2-NEXT: movdqa %xmm1, %xmm3
371 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
372 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
373 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
374 ; SSE2-NEXT: paddd %xmm0, %xmm11
375 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm11 # 16-byte Folded Reload
376 ; SSE2-NEXT: paddd %xmm0, %xmm4
377 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
378 ; SSE2-NEXT: paddd %xmm0, %xmm12
379 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload
380 ; SSE2-NEXT: paddd %xmm0, %xmm10
381 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
382 ; SSE2-NEXT: paddd %xmm0, %xmm14
383 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload
384 ; SSE2-NEXT: paddd %xmm0, %xmm7
385 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
386 ; SSE2-NEXT: paddd %xmm0, %xmm6
387 ; SSE2-NEXT: paddd (%rsp), %xmm6 # 16-byte Folded Reload
388 ; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
389 ; SSE2-NEXT: paddd %xmm0, %xmm15
390 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload
391 ; SSE2-NEXT: paddd %xmm0, %xmm13
392 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
393 ; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill
394 ; SSE2-NEXT: paddd %xmm0, %xmm8
395 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
396 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
397 ; SSE2-NEXT: paddd %xmm0, %xmm6
398 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
399 ; SSE2-NEXT: paddd %xmm0, %xmm2
400 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
401 ; SSE2-NEXT: paddd %xmm0, %xmm5
402 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
403 ; SSE2-NEXT: paddd %xmm0, %xmm9
404 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
405 ; SSE2-NEXT: paddd %xmm0, %xmm3
406 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
407 ; SSE2-NEXT: movdqa %xmm3, %xmm13
408 ; SSE2-NEXT: paddd %xmm0, %xmm1
409 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
377 ; SSE2-NEXT: psubd %xmm0, %xmm10
378 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
379 ; SSE2-NEXT: psubd %xmm0, %xmm1
380 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
381 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
382 ; SSE2-NEXT: psubd %xmm0, %xmm2
383 ; SSE2-NEXT: psubd %xmm0, %xmm12
384 ; SSE2-NEXT: psubd %xmm0, %xmm4
385 ; SSE2-NEXT: psubd %xmm0, %xmm15
386 ; SSE2-NEXT: psubd %xmm0, %xmm13
387 ; SSE2-NEXT: psubd %xmm0, %xmm9
388 ; SSE2-NEXT: psubd %xmm0, %xmm6
389 ; SSE2-NEXT: psubd %xmm0, %xmm14
390 ; SSE2-NEXT: psubd %xmm0, %xmm11
391 ; SSE2-NEXT: psubd %xmm0, %xmm8
392 ; SSE2-NEXT: psubd %xmm0, %xmm3
393 ; SSE2-NEXT: psubd %xmm0, %xmm5
394 ; SSE2-NEXT: psubd %xmm0, %xmm7
395 ; SSE2-NEXT: psrld $1, %xmm10
396 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
397 ; SSE2-NEXT: psrld $1, %xmm1
398 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
399 ; SSE2-NEXT: pand %xmm0, %xmm1
400 ; SSE2-NEXT: pand %xmm0, %xmm10
401 ; SSE2-NEXT: packuswb %xmm1, %xmm10
402 ; SSE2-NEXT: psrld $1, %xmm2
403 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
404 ; SSE2-NEXT: psrld $1, %xmm1
405 ; SSE2-NEXT: pand %xmm0, %xmm1
406 ; SSE2-NEXT: pand %xmm0, %xmm2
407 ; SSE2-NEXT: packuswb %xmm1, %xmm2
408 ; SSE2-NEXT: packuswb %xmm10, %xmm2
409 ; SSE2-NEXT: movdqa %xmm2, %xmm1
410410 ; SSE2-NEXT: psrld $1, %xmm4
411 ; SSE2-NEXT: psrld $1, %xmm11
412 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
413 ; SSE2-NEXT: pand %xmm0, %xmm11
414 ; SSE2-NEXT: pand %xmm0, %xmm4
415 ; SSE2-NEXT: packuswb %xmm11, %xmm4
416 ; SSE2-NEXT: psrld $1, %xmm10
417411 ; SSE2-NEXT: psrld $1, %xmm12
418412 ; SSE2-NEXT: pand %xmm0, %xmm12
419 ; SSE2-NEXT: pand %xmm0, %xmm10
420 ; SSE2-NEXT: packuswb %xmm12, %xmm10
421 ; SSE2-NEXT: packuswb %xmm4, %xmm10
422 ; SSE2-NEXT: psrld $1, %xmm7
413 ; SSE2-NEXT: pand %xmm0, %xmm4
414 ; SSE2-NEXT: packuswb %xmm12, %xmm4
415 ; SSE2-NEXT: psrld $1, %xmm13
416 ; SSE2-NEXT: psrld $1, %xmm15
417 ; SSE2-NEXT: pand %xmm0, %xmm15
418 ; SSE2-NEXT: pand %xmm0, %xmm13
419 ; SSE2-NEXT: packuswb %xmm15, %xmm13
420 ; SSE2-NEXT: packuswb %xmm4, %xmm13
421 ; SSE2-NEXT: psrld $1, %xmm6
422 ; SSE2-NEXT: psrld $1, %xmm9
423 ; SSE2-NEXT: pand %xmm0, %xmm9
424 ; SSE2-NEXT: pand %xmm0, %xmm6
425 ; SSE2-NEXT: packuswb %xmm9, %xmm6
426 ; SSE2-NEXT: psrld $1, %xmm11
423427 ; SSE2-NEXT: psrld $1, %xmm14
424428 ; SSE2-NEXT: pand %xmm0, %xmm14
429 ; SSE2-NEXT: pand %xmm0, %xmm11
430 ; SSE2-NEXT: packuswb %xmm14, %xmm11
431 ; SSE2-NEXT: packuswb %xmm6, %xmm11
432 ; SSE2-NEXT: psrld $1, %xmm3
433 ; SSE2-NEXT: psrld $1, %xmm8
434 ; SSE2-NEXT: pand %xmm0, %xmm8
435 ; SSE2-NEXT: pand %xmm0, %xmm3
436 ; SSE2-NEXT: packuswb %xmm8, %xmm3
437 ; SSE2-NEXT: psrld $1, %xmm7
438 ; SSE2-NEXT: psrld $1, %xmm5
439 ; SSE2-NEXT: pand %xmm0, %xmm5
425440 ; SSE2-NEXT: pand %xmm0, %xmm7
426 ; SSE2-NEXT: packuswb %xmm14, %xmm7
427 ; SSE2-NEXT: psrld $1, %xmm15
428 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
429 ; SSE2-NEXT: psrld $1, %xmm3
430 ; SSE2-NEXT: pand %xmm0, %xmm3
431 ; SSE2-NEXT: pand %xmm0, %xmm15
432 ; SSE2-NEXT: packuswb %xmm3, %xmm15
433 ; SSE2-NEXT: packuswb %xmm7, %xmm15
434 ; SSE2-NEXT: psrld $1, %xmm8
435 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
436 ; SSE2-NEXT: psrld $1, %xmm4
437 ; SSE2-NEXT: pand %xmm0, %xmm4
438 ; SSE2-NEXT: pand %xmm0, %xmm8
439 ; SSE2-NEXT: packuswb %xmm4, %xmm8
440 ; SSE2-NEXT: psrld $1, %xmm2
441 ; SSE2-NEXT: movdqa %xmm6, %xmm3
442 ; SSE2-NEXT: psrld $1, %xmm3
443 ; SSE2-NEXT: pand %xmm0, %xmm3
444 ; SSE2-NEXT: pand %xmm0, %xmm2
445 ; SSE2-NEXT: packuswb %xmm3, %xmm2
446 ; SSE2-NEXT: packuswb %xmm8, %xmm2
447 ; SSE2-NEXT: psrld $1, %xmm9
448 ; SSE2-NEXT: movdqa %xmm5, %xmm4
449 ; SSE2-NEXT: psrld $1, %xmm4
450 ; SSE2-NEXT: pand %xmm0, %xmm4
451 ; SSE2-NEXT: pand %xmm0, %xmm9
452 ; SSE2-NEXT: packuswb %xmm4, %xmm9
453 ; SSE2-NEXT: psrld $1, %xmm1
454 ; SSE2-NEXT: psrld $1, %xmm13
455 ; SSE2-NEXT: pand %xmm0, %xmm13
456 ; SSE2-NEXT: pand %xmm0, %xmm1
457 ; SSE2-NEXT: packuswb %xmm13, %xmm1
458 ; SSE2-NEXT: packuswb %xmm9, %xmm1
441 ; SSE2-NEXT: packuswb %xmm5, %xmm7
442 ; SSE2-NEXT: packuswb %xmm3, %xmm7
443 ; SSE2-NEXT: movdqu %xmm7, (%rax)
444 ; SSE2-NEXT: movdqu %xmm11, (%rax)
445 ; SSE2-NEXT: movdqu %xmm13, (%rax)
459446 ; SSE2-NEXT: movdqu %xmm1, (%rax)
460 ; SSE2-NEXT: movdqu %xmm2, (%rax)
461 ; SSE2-NEXT: movdqu %xmm15, (%rax)
462 ; SSE2-NEXT: movdqu %xmm10, (%rax)
463 ; SSE2-NEXT: addq $152, %rsp
464447 ; SSE2-NEXT: retq
465448 ;
466449 ; AVX1-LABEL: avg_v64i8:
467450 ; AVX1: # BB#0:
451 ; AVX1-NEXT: subq $24, %rsp
452 ; AVX1-NEXT: .Lcfi0:
453 ; AVX1-NEXT: .cfi_def_cfa_offset 32
454 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
455 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
456 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
457 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
458 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
468459 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
460 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
461 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
462 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
463 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
464 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
469465 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
470 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
471 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
472 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
473 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
474 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
475 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
476 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
477 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
478 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
479 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
480 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
466 ; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
467 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
468 ; AVX1-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill
469 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
470 ; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
471 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
472 ; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
473 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
474 ; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0
481475 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
482 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
476 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
477 ; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm0
478 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
479 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
480 ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm0
481 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
482 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
483 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm0
484 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
485 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
486 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm0
483487 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
484488 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
485 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1,1,1]
486 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm4
487 ; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm14
489 ; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm13
488490 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
489 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm4
490 ; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
491 ; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm12
491492 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
492 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm4
493 ; AVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm4
494 ; AVX1-NEXT: vmovdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
495 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
496 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm4
497 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm9
498 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
499 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm4
500 ; AVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11
501 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
502 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm4
503 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
504 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
505 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm4
506 ; AVX1-NEXT: vpaddd %xmm4, %xmm15, %xmm15
507 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
508 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm4
509 ; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
510 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
511 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm4
512 ; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm2
493 ; AVX1-NEXT: vpaddd %xmm4, %xmm15, %xmm11
494 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
495 ; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm10
496 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
497 ; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm8
498 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
499 ; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm9
500 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
501 ; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
502 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
503 ; AVX1-NEXT: vpaddd (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload
504 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
505 ; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm5, %xmm3 # 16-byte Folded Reload
506 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
507 ; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm5, %xmm2 # 16-byte Folded Reload
508 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
509 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
510 ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm1
511 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
512 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
513 ; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm14
514 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
515 ; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
516 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
517 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
518 ; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
519 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
520 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
521 ; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
522 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
523 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm15
524 ; AVX1-NEXT: vmovdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
525 ; AVX1-NEXT: vpsubd %xmm0, %xmm13, %xmm13
526 ; AVX1-NEXT: vpsubd %xmm0, %xmm12, %xmm12
527 ; AVX1-NEXT: vpsubd %xmm0, %xmm11, %xmm11
528 ; AVX1-NEXT: vpsubd %xmm0, %xmm10, %xmm10
529 ; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm8
530 ; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm9
531 ; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
532 ; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm7
533 ; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
534 ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
535 ; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
513536 ; AVX1-NEXT: vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
514 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
515 ; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm4
516 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm8
517 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
518 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
519 ; AVX1-NEXT: vpaddd %xmm1, %xmm12, %xmm12
520 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
521 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
522 ; AVX1-NEXT: vpaddd %xmm1, %xmm13, %xmm4
523 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
524 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
525 ; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm1, %xmm13 # 16-byte Folded Reload
526 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
527 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
528 ; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm1, %xmm5 # 16-byte Folded Reload
529 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
530 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
531 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
532 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm10
533 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
534 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
535 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
536 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
537 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm0
537 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
538 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm1
538539 ; AVX1-NEXT: vpsrld $1, %xmm14, %xmm14
539 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
540 ; AVX1-NEXT: vpand %xmm7, %xmm14, %xmm14
541 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
542 ; AVX1-NEXT: vpackuswb %xmm14, %xmm0, %xmm14
543 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm0
540 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
541 ; AVX1-NEXT: vpand %xmm5, %xmm14, %xmm14
542 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
543 ; AVX1-NEXT: vpackuswb %xmm14, %xmm1, %xmm1
544 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
545 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm6
544546 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
545547 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
546 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
547 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
548 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
549 ; AVX1-NEXT: vpackuswb %xmm14, %xmm0, %xmm0
550 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
551 ; AVX1-NEXT: vpsrld $1, %xmm11, %xmm3
552 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
553 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
554 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
555 ; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3
556 ; AVX1-NEXT: vpsrld $1, %xmm15, %xmm6
557 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
558 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
559 ; AVX1-NEXT: vpackuswb %xmm6, %xmm3, %xmm3
560 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
561 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
548 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
549 ; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
550 ; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
551 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
552 ; AVX1-NEXT: vpsrld $1, %xmm13, %xmm2
553 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
554 ; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
555 ; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
556 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
557 ; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
558 ; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6
559 ; AVX1-NEXT: vpsrld $1, %xmm12, %xmm7
560 ; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm7
561 ; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
562 ; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6
563 ; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
564 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
562565 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm2
563 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
566 ; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
567 ; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
568 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
569 ; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
570 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
571 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm6
572 ; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
573 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
574 ; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
575 ; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
564576 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
565 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
566 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
567 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
568 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm3
569 ; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4
570 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
571 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
577 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
578 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
579 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
580 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
572581 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
573 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
574 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3
575 ; AVX1-NEXT: vpsrld $1, %xmm13, %xmm4
576 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
577 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
578 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
579 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
580 ; AVX1-NEXT: vpsrld $1, %xmm10, %xmm4
581 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
582 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
583 ; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
584 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
585 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
582 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
583 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
584 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
585 ; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
586 ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
587 ; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
588 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
589 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
590 ; AVX1-NEXT: vmovups %ymm0, (%rax)
586591 ; AVX1-NEXT: vmovups %ymm1, (%rax)
587 ; AVX1-NEXT: vmovups %ymm0, (%rax)
592 ; AVX1-NEXT: addq $24, %rsp
588593 ; AVX1-NEXT: vzeroupper
589594 ; AVX1-NEXT: retq
590595 ;
599604 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
600605 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
601606 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
602 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm9
603 ; AVX2-NEXT: vpaddd %ymm9, %ymm8, %ymm8
604 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm8
605 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
606 ; AVX2-NEXT: vpaddd %ymm9, %ymm0, %ymm0
607 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm10
608 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
609 ; AVX2-NEXT: vpaddd %ymm9, %ymm0, %ymm0
610 ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm11
611 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
612 ; AVX2-NEXT: vpaddd %ymm9, %ymm0, %ymm0
613 ; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm12
614 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
615 ; AVX2-NEXT: vpaddd %ymm9, %ymm0, %ymm0
616 ; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm2
617 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
618 ; AVX2-NEXT: vpaddd %ymm9, %ymm0, %ymm0
619 ; AVX2-NEXT: vpaddd %ymm0, %ymm5, %ymm4
620 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
621 ; AVX2-NEXT: vpaddd %ymm9, %ymm0, %ymm0
622 ; AVX2-NEXT: vpaddd %ymm0, %ymm6, %ymm13
623 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
624 ; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1
625 ; AVX2-NEXT: vpaddd %ymm1, %ymm7, %ymm1
626 ; AVX2-NEXT: vpsrld $1, %ymm10, %ymm6
627 ; AVX2-NEXT: vpsrld $1, %ymm8, %ymm5
628 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
629 ; AVX2-NEXT: vpshufb %ymm3, %ymm5, %ymm5
630 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,2,3]
631 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
632 ; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm7
633 ; AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm6
634 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
635 ; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm6
636 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
637 ; AVX2-NEXT: vpsrld $1, %ymm12, %ymm7
638 ; AVX2-NEXT: vpsrld $1, %ymm11, %ymm8
639 ; AVX2-NEXT: vpshufb %ymm3, %ymm8, %ymm8
640 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
641 ; AVX2-NEXT: vpshufb %xmm5, %xmm8, %xmm0
642 ; AVX2-NEXT: vpshufb %ymm3, %ymm7, %ymm7
643 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
644 ; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm7
645 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0]
646 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
647 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
648 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
649 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
650 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
651 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
652 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm4
653 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
654 ; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
655 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
656 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
657 ; AVX2-NEXT: vpsrld $1, %ymm13, %ymm4
658 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm4
659 ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
660 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,2,2,3]
661 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
662 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
663 ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
664 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
665 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
666 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
667 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
668 ; AVX2-NEXT: vzeroupper
669 ; AVX2-NEXT: retq
670 ;
671 ; AVX512F-LABEL: avg_v64i8:
672 ; AVX512F: # BB#0:
673 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
674 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
675 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
676 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
677 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
678 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
679 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
680 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
681 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm8
682 ; AVX512F-NEXT: vpaddd %zmm8, %zmm4, %zmm4
683 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
684 ; AVX512F-NEXT: vpaddd %zmm8, %zmm5, %zmm4
685 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
686 ; AVX512F-NEXT: vpaddd %zmm8, %zmm6, %zmm4
687 ; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
688 ; AVX512F-NEXT: vpaddd %zmm8, %zmm7, %zmm4
689 ; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
690 ; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3
691 ; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2
692 ; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
693 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
694 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
695 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
696 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
697 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm1
698 ; AVX512F-NEXT: vpmovdb %zmm3, %xmm2
699 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
700 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
701 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
702 ; AVX512F-NEXT: vzeroupper
703 ; AVX512F-NEXT: retq
704 ;
705 ; AVX512BW-LABEL: avg_v64i8:
706 ; AVX512BW: # BB#0:
707 ; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
708 ; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
709 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
710 ; AVX512BW-NEXT: vzeroupper
711 ; AVX512BW-NEXT: retq
712 %1 = load <64 x i8>, <64 x i8>* %a
713 %2 = load <64 x i8>, <64 x i8>* %b
714 %3 = zext <64 x i8> %1 to <64 x i32>
715 %4 = zext <64 x i8> %2 to <64 x i32>
716 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
717 %6 = add nuw nsw <64 x i32> %5, %4
718 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
719 %8 = trunc <64 x i32> %7 to <64 x i8>
720 store <64 x i8> %8, <64 x i8>* undef, align 4
721 ret void
722 }
723
724 define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
725 ; SSE2-LABEL: avg_v4i16:
726 ; SSE2: # BB#0:
727 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
728 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
729 ; SSE2-NEXT: pavgw %xmm0, %xmm1
730 ; SSE2-NEXT: movq %xmm1, (%rax)
731 ; SSE2-NEXT: retq
732 ;
733 ; AVX-LABEL: avg_v4i16:
734 ; AVX: # BB#0:
735 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
736 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
737 ; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
738 ; AVX-NEXT: vmovq %xmm0, (%rax)
739 ; AVX-NEXT: retq
740 %1 = load <4 x i16>, <4 x i16>* %a
741 %2 = load <4 x i16>, <4 x i16>* %b
742 %3 = zext <4 x i16> %1 to <4 x i32>
743 %4 = zext <4 x i16> %2 to <4 x i32>
744 %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
745 %6 = add nuw nsw <4 x i32> %5, %4
746 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
747 %8 = trunc <4 x i32> %7 to <4 x i16>
748 store <4 x i16> %8, <4 x i16>* undef, align 4
749 ret void
750 }
751
752 define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
753 ; SSE2-LABEL: avg_v8i16:
754 ; SSE2: # BB#0:
755 ; SSE2-NEXT: movdqa (%rsi), %xmm0
756 ; SSE2-NEXT: pavgw (%rdi), %xmm0
757 ; SSE2-NEXT: movdqu %xmm0, (%rax)
758 ; SSE2-NEXT: retq
759 ;
760 ; AVX-LABEL: avg_v8i16:
761 ; AVX: # BB#0:
762 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
763 ; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
764 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
765 ; AVX-NEXT: retq
766 %1 = load <8 x i16>, <8 x i16>* %a
767 %2 = load <8 x i16>, <8 x i16>* %b
768 %3 = zext <8 x i16> %1 to <8 x i32>
769 %4 = zext <8 x i16> %2 to <8 x i32>
770 %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
771 %6 = add nuw nsw <8 x i32> %5, %4
772 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
773 %8 = trunc <8 x i32> %7 to <8 x i16>
774 store <8 x i16> %8, <8 x i16>* undef, align 4
775 ret void
776 }
777
778 define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
779 ; SSE2-LABEL: avg_v16i16:
780 ; SSE2: # BB#0:
781 ; SSE2-NEXT: movdqa (%rdi), %xmm5
782 ; SSE2-NEXT: movdqa 16(%rdi), %xmm4
783 ; SSE2-NEXT: movdqa (%rsi), %xmm0
784 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
785 ; SSE2-NEXT: pxor %xmm6, %xmm6
786 ; SSE2-NEXT: movdqa %xmm5, %xmm7
787 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
788 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
789 ; SSE2-NEXT: movdqa %xmm4, %xmm8
790 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
791 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
792 ; SSE2-NEXT: movdqa %xmm0, %xmm3
793 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
794 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
795 ; SSE2-NEXT: movdqa %xmm1, %xmm2
796 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
797 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
798 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1]
799 ; SSE2-NEXT: paddd %xmm6, %xmm3
800 ; SSE2-NEXT: paddd %xmm7, %xmm3
801 ; SSE2-NEXT: paddd %xmm6, %xmm0
802 ; SSE2-NEXT: paddd %xmm5, %xmm0
803 ; SSE2-NEXT: paddd %xmm6, %xmm2
804 ; SSE2-NEXT: paddd %xmm8, %xmm2
805 ; SSE2-NEXT: paddd %xmm6, %xmm1
806 ; SSE2-NEXT: paddd %xmm4, %xmm1
807 ; SSE2-NEXT: psrld $1, %xmm1
808 ; SSE2-NEXT: psrld $1, %xmm2
809 ; SSE2-NEXT: psrld $1, %xmm0
810 ; SSE2-NEXT: psrld $1, %xmm3
811 ; SSE2-NEXT: pslld $16, %xmm3
812 ; SSE2-NEXT: psrad $16, %xmm3
813 ; SSE2-NEXT: pslld $16, %xmm0
814 ; SSE2-NEXT: psrad $16, %xmm0
815 ; SSE2-NEXT: packssdw %xmm3, %xmm0
816 ; SSE2-NEXT: pslld $16, %xmm2
817 ; SSE2-NEXT: psrad $16, %xmm2
818 ; SSE2-NEXT: pslld $16, %xmm1
819 ; SSE2-NEXT: psrad $16, %xmm1
820 ; SSE2-NEXT: packssdw %xmm2, %xmm1
821 ; SSE2-NEXT: movdqu %xmm1, (%rax)
822 ; SSE2-NEXT: movdqu %xmm0, (%rax)
823 ; SSE2-NEXT: retq
824 ;
825 ; AVX1-LABEL: avg_v16i16:
826 ; AVX1: # BB#0:
827 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
828 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
829 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
830 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
831 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
832 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
833 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
834 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
835 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
836 ; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
837 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
838 ; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm4
839 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
840 ; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm4
841 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
842 ; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm3
843 ; AVX1-NEXT: vpaddd %xmm3, %xmm8, %xmm3
844 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
845 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
846 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
847 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
848 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
849 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
850 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
851 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
852 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
853 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
854 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
855 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
856 ; AVX1-NEXT: vmovups %ymm0, (%rax)
857 ; AVX1-NEXT: vzeroupper
858 ; AVX1-NEXT: retq
859 ;
860 ; AVX2-LABEL: avg_v16i16:
861 ; AVX2: # BB#0:
862 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
863 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
864 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
865 ; AVX2-NEXT: vzeroupper
866 ; AVX2-NEXT: retq
867 ;
868 ; AVX512-LABEL: avg_v16i16:
869 ; AVX512: # BB#0:
870 ; AVX512-NEXT: vmovdqa (%rsi), %ymm0
871 ; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0
872 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
873 ; AVX512-NEXT: vzeroupper
874 ; AVX512-NEXT: retq
875 %1 = load <16 x i16>, <16 x i16>* %a
876 %2 = load <16 x i16>, <16 x i16>* %b
877 %3 = zext <16 x i16> %1 to <16 x i32>
878 %4 = zext <16 x i16> %2 to <16 x i32>
879 %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
880 %6 = add nuw nsw <16 x i32> %5, %4
881 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
882 %8 = trunc <16 x i32> %7 to <16 x i16>
883 store <16 x i16> %8, <16 x i16>* undef, align 4
884 ret void
885 }
886
887 define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
888 ; SSE2-LABEL: avg_v32i16:
889 ; SSE2: # BB#0:
890 ; SSE2-NEXT: movdqa (%rdi), %xmm11
891 ; SSE2-NEXT: movdqa 16(%rdi), %xmm10
892 ; SSE2-NEXT: movdqa 32(%rdi), %xmm9
893 ; SSE2-NEXT: movdqa 48(%rdi), %xmm4
894 ; SSE2-NEXT: movdqa (%rsi), %xmm8
895 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
896 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
897 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
898 ; SSE2-NEXT: pxor %xmm0, %xmm0
899 ; SSE2-NEXT: movdqa %xmm11, %xmm15
900 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
901 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
902 ; SSE2-NEXT: movdqa %xmm10, %xmm14
903 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
904 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
905 ; SSE2-NEXT: movdqa %xmm9, %xmm13
906 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
907 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
908 ; SSE2-NEXT: movdqa %xmm4, %xmm12
909 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
910 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
911 ; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
912 ; SSE2-NEXT: movdqa %xmm8, %xmm7
913 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
914 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
915 ; SSE2-NEXT: movdqa %xmm1, %xmm6
916 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
917 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
918 ; SSE2-NEXT: movdqa %xmm2, %xmm5
919 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
920 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
921 ; SSE2-NEXT: movdqa %xmm3, %xmm4
922 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
923 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
924 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
925 ; SSE2-NEXT: paddd %xmm0, %xmm7
926 ; SSE2-NEXT: paddd %xmm15, %xmm7
927 ; SSE2-NEXT: paddd %xmm0, %xmm8
928 ; SSE2-NEXT: paddd %xmm11, %xmm8
929 ; SSE2-NEXT: paddd %xmm0, %xmm6
930 ; SSE2-NEXT: paddd %xmm14, %xmm6
931 ; SSE2-NEXT: paddd %xmm0, %xmm1
932 ; SSE2-NEXT: paddd %xmm10, %xmm1
933 ; SSE2-NEXT: paddd %xmm0, %xmm5
934 ; SSE2-NEXT: paddd %xmm13, %xmm5
935 ; SSE2-NEXT: paddd %xmm0, %xmm2
936 ; SSE2-NEXT: paddd %xmm9, %xmm2
937 ; SSE2-NEXT: paddd %xmm0, %xmm4
938 ; SSE2-NEXT: paddd %xmm12, %xmm4
939 ; SSE2-NEXT: paddd %xmm0, %xmm3
940 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
941 ; SSE2-NEXT: psrld $1, %xmm8
942 ; SSE2-NEXT: psrld $1, %xmm7
943 ; SSE2-NEXT: pslld $16, %xmm7
944 ; SSE2-NEXT: psrad $16, %xmm7
945 ; SSE2-NEXT: pslld $16, %xmm8
946 ; SSE2-NEXT: psrad $16, %xmm8
947 ; SSE2-NEXT: packssdw %xmm7, %xmm8
948 ; SSE2-NEXT: psrld $1, %xmm1
949 ; SSE2-NEXT: psrld $1, %xmm6
950 ; SSE2-NEXT: pslld $16, %xmm6
951 ; SSE2-NEXT: psrad $16, %xmm6
952 ; SSE2-NEXT: pslld $16, %xmm1
953 ; SSE2-NEXT: psrad $16, %xmm1
954 ; SSE2-NEXT: packssdw %xmm6, %xmm1
955 ; SSE2-NEXT: psrld $1, %xmm2
956 ; SSE2-NEXT: psrld $1, %xmm5
957 ; SSE2-NEXT: pslld $16, %xmm5
958 ; SSE2-NEXT: psrad $16, %xmm5
959 ; SSE2-NEXT: pslld $16, %xmm2
960 ; SSE2-NEXT: psrad $16, %xmm2
961 ; SSE2-NEXT: packssdw %xmm5, %xmm2
962 ; SSE2-NEXT: psrld $1, %xmm3
963 ; SSE2-NEXT: psrld $1, %xmm4
964 ; SSE2-NEXT: pslld $16, %xmm4
965 ; SSE2-NEXT: psrad $16, %xmm4
966 ; SSE2-NEXT: pslld $16, %xmm3
967 ; SSE2-NEXT: psrad $16, %xmm3
968 ; SSE2-NEXT: packssdw %xmm4, %xmm3
969 ; SSE2-NEXT: movdqu %xmm3, (%rax)
970 ; SSE2-NEXT: movdqu %xmm2, (%rax)
971 ; SSE2-NEXT: movdqu %xmm1, (%rax)
972 ; SSE2-NEXT: movdqu %xmm8, (%rax)
973 ; SSE2-NEXT: retq
974 ;
975 ; AVX1-LABEL: avg_v32i16:
976 ; AVX1: # BB#0:
977 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
978 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
979 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
980 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
981 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
982 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
983 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
984 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
985 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1]
986 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
987 ; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0
988 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
989 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
990 ; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
991 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
992 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
993 ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
994 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
995 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
996 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
997 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
998 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
999 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm9
1000 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1001 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
1002 ; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
1003 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1004 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
1005 ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
1006 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1007 ; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
1008 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1009 ; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm4
1010 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
1011 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
1012 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
1013 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2],xmm6[3],xmm0[4],xmm6[5],xmm0[6],xmm6[7]
1014 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2],xmm6[3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
1015 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
1016 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm1
1017 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
1018 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2],xmm6[3],xmm2[4],xmm6[5],xmm2[6],xmm6[7]
1019 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2],xmm6[3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
1020 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1021 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1022 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm1
1023 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm2
1024 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2],xmm6[3],xmm2[4],xmm6[5],xmm2[6],xmm6[7]
1025 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2],xmm6[3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
1026 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1027 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm2
1028 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm3
1029 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2],xmm6[3],xmm3[4],xmm6[5],xmm3[6],xmm6[7]
1030 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2],xmm6[3],xmm2[4],xmm6[5],xmm2[6],xmm6[7]
1031 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1032 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1033 ; AVX1-NEXT: vmovups %ymm1, (%rax)
1034 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1035 ; AVX1-NEXT: vzeroupper
1036 ; AVX1-NEXT: retq
1037 ;
1038 ; AVX2-LABEL: avg_v32i16:
1039 ; AVX2: # BB#0:
1040 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1041 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1042 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1043 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1044 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1045 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1046 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1047 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1048 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8
607 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
608 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
609 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
610 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
611 ; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
612 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
613 ; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
614 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1049615 ; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
1050 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
1051 ; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm4
1052 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
1053 ; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm4
1054 ; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
1055 ; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm4
1056 ; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
1057 ; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
1058 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
1059 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
1060 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
1061 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1062 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1063 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1064 ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1065 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1066 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1067 ; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1
1068 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1069 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2
1070 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1071 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
1072 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1073 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1074 ; AVX2-NEXT: vzeroupper
1075 ; AVX2-NEXT: retq
1076 ;
1077 ; AVX512F-LABEL: avg_v32i16:
1078 ; AVX512F: # BB#0:
1079 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1080 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1081 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1082 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1083 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4
1084 ; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
1085 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
1086 ; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm2
1087 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
1088 ; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
1089 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
1090 ; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
1091 ; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
1092 ; AVX512F-NEXT: vzeroupper
1093 ; AVX512F-NEXT: retq
1094 ;
1095 ; AVX512BW-LABEL: avg_v32i16:
1096 ; AVX512BW: # BB#0:
1097 ; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
1098 ; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
1099 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
1100 ; AVX512BW-NEXT: vzeroupper
1101 ; AVX512BW-NEXT: retq
1102 %1 = load <32 x i16>, <32 x i16>* %a
1103 %2 = load <32 x i16>, <32 x i16>* %b
1104 %3 = zext <32 x i16> %1 to <32 x i32>
1105 %4 = zext <32 x i16> %2 to <32 x i32>
1106 %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1107 %6 = add nuw nsw <32 x i32> %5, %4
1108 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1109 %8 = trunc <32 x i32> %7 to <32 x i16>
1110 store <32 x i16> %8, <32 x i16>* undef, align 4
1111 ret void
1112 }
1113
1114 define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
1115 ; SSE2-LABEL: avg_v4i8_2:
1116 ; SSE2: # BB#0:
1117 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1118 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1119 ; SSE2-NEXT: pavgb %xmm0, %xmm1
1120 ; SSE2-NEXT: movd %xmm1, (%rax)
1121 ; SSE2-NEXT: retq
1122 ;
1123 ; AVX-LABEL: avg_v4i8_2:
1124 ; AVX: # BB#0:
1125 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1126 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1127 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
1128 ; AVX-NEXT: vmovd %xmm0, (%rax)
1129 ; AVX-NEXT: retq
1130 %1 = load <4 x i8>, <4 x i8>* %a
1131 %2 = load <4 x i8>, <4 x i8>* %b
1132 %3 = zext <4 x i8> %1 to <4 x i32>
1133 %4 = zext <4 x i8> %2 to <4 x i32>
1134 %5 = add nuw nsw <4 x i32> %3, %4
1135 %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
1136 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
1137 %8 = trunc <4 x i32> %7 to <4 x i8>
1138 store <4 x i8> %8, <4 x i8>* undef, align 4
1139 ret void
1140 }
1141
1142 define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
1143 ; SSE2-LABEL: avg_v8i8_2:
1144 ; SSE2: # BB#0:
1145 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1146 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1147 ; SSE2-NEXT: pavgb %xmm0, %xmm1
1148 ; SSE2-NEXT: movq %xmm1, (%rax)
1149 ; SSE2-NEXT: retq
1150 ;
1151 ; AVX-LABEL: avg_v8i8_2:
1152 ; AVX: # BB#0:
1153 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1154 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1155 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
1156 ; AVX-NEXT: vmovq %xmm0, (%rax)
1157 ; AVX-NEXT: retq
1158 %1 = load <8 x i8>, <8 x i8>* %a
1159 %2 = load <8 x i8>, <8 x i8>* %b
1160 %3 = zext <8 x i8> %1 to <8 x i32>
1161 %4 = zext <8 x i8> %2 to <8 x i32>
1162 %5 = add nuw nsw <8 x i32> %3, %4
1163 %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1164 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1165 %8 = trunc <8 x i32> %7 to <8 x i8>
1166 store <8 x i8> %8, <8 x i8>* undef, align 4
1167 ret void
1168 }
1169
1170 define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
1171 ; SSE2-LABEL: avg_v16i8_2:
1172 ; SSE2: # BB#0:
1173 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1174 ; SSE2-NEXT: pavgb (%rsi), %xmm0
1175 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1176 ; SSE2-NEXT: retq
1177 ;
1178 ; AVX-LABEL: avg_v16i8_2:
1179 ; AVX: # BB#0:
1180 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1181 ; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
1182 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
1183 ; AVX-NEXT: retq
1184 %1 = load <16 x i8>, <16 x i8>* %a
1185 %2 = load <16 x i8>, <16 x i8>* %b
1186 %3 = zext <16 x i8> %1 to <16 x i32>
1187 %4 = zext <16 x i8> %2 to <16 x i32>
1188 %5 = add nuw nsw <16 x i32> %3, %4
1189 %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1190 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1191 %8 = trunc <16 x i32> %7 to <16 x i8>
1192 store <16 x i8> %8, <16 x i8>* undef, align 4
1193 ret void
1194 }
1195
1196 define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
1197 ; SSE2-LABEL: avg_v32i8_2:
1198 ; SSE2: # BB#0:
1199 ; SSE2-NEXT: movdqa (%rdi), %xmm9
1200 ; SSE2-NEXT: movdqa 16(%rdi), %xmm12
1201 ; SSE2-NEXT: movdqa (%rsi), %xmm4
1202 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
1203 ; SSE2-NEXT: pxor %xmm0, %xmm0
1204 ; SSE2-NEXT: movdqa %xmm9, %xmm11
1205 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
1206 ; SSE2-NEXT: movdqa %xmm11, %xmm15
1207 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
1208 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
1209 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
1210 ; SSE2-NEXT: movdqa %xmm9, %xmm14
1211 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
1212 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
1213 ; SSE2-NEXT: movdqa %xmm12, %xmm10
1214 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
1215 ; SSE2-NEXT: movdqa %xmm10, %xmm13
1216 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
1217 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
1218 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
1219 ; SSE2-NEXT: movdqa %xmm12, %xmm2
1220 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1221 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
1222 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
1223 ; SSE2-NEXT: movdqa %xmm4, %xmm3
1224 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
1225 ; SSE2-NEXT: movdqa %xmm3, %xmm7
1226 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
1227 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1228 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1229 ; SSE2-NEXT: movdqa %xmm4, %xmm5
1230 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
1231 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1232 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1233 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
1234 ; SSE2-NEXT: movdqa %xmm2, %xmm8
1235 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
1236 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1237 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1238 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1239 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
1240 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1241 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
1242 ; SSE2-NEXT: paddd %xmm0, %xmm7
1243 ; SSE2-NEXT: paddd %xmm15, %xmm7
1244 ; SSE2-NEXT: paddd %xmm0, %xmm3
1245 ; SSE2-NEXT: paddd %xmm11, %xmm3
1246 ; SSE2-NEXT: paddd %xmm0, %xmm5
1247 ; SSE2-NEXT: paddd %xmm14, %xmm5
1248 ; SSE2-NEXT: paddd %xmm0, %xmm4
1249 ; SSE2-NEXT: paddd %xmm9, %xmm4
1250 ; SSE2-NEXT: paddd %xmm0, %xmm8
1251 ; SSE2-NEXT: paddd %xmm13, %xmm8
1252 ; SSE2-NEXT: paddd %xmm0, %xmm2
1253 ; SSE2-NEXT: paddd %xmm10, %xmm2
1254 ; SSE2-NEXT: paddd %xmm0, %xmm6
1255 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
1256 ; SSE2-NEXT: paddd %xmm0, %xmm1
1257 ; SSE2-NEXT: paddd %xmm12, %xmm1
1258 ; SSE2-NEXT: psrld $1, %xmm3
1259 ; SSE2-NEXT: psrld $1, %xmm7
1260 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1261 ; SSE2-NEXT: pand %xmm0, %xmm7
1262 ; SSE2-NEXT: pand %xmm0, %xmm3
1263 ; SSE2-NEXT: packuswb %xmm7, %xmm3
1264 ; SSE2-NEXT: psrld $1, %xmm4
1265 ; SSE2-NEXT: psrld $1, %xmm5
1266 ; SSE2-NEXT: pand %xmm0, %xmm5
1267 ; SSE2-NEXT: pand %xmm0, %xmm4
1268 ; SSE2-NEXT: packuswb %xmm5, %xmm4
1269 ; SSE2-NEXT: packuswb %xmm3, %xmm4
1270 ; SSE2-NEXT: psrld $1, %xmm2
1271 ; SSE2-NEXT: psrld $1, %xmm8
1272 ; SSE2-NEXT: pand %xmm0, %xmm8
1273 ; SSE2-NEXT: pand %xmm0, %xmm2
1274 ; SSE2-NEXT: packuswb %xmm8, %xmm2
1275 ; SSE2-NEXT: psrld $1, %xmm1
1276 ; SSE2-NEXT: psrld $1, %xmm6
1277 ; SSE2-NEXT: pand %xmm0, %xmm6
1278 ; SSE2-NEXT: pand %xmm0, %xmm1
1279 ; SSE2-NEXT: packuswb %xmm6, %xmm1
1280 ; SSE2-NEXT: packuswb %xmm2, %xmm1
1281 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1282 ; SSE2-NEXT: movdqu %xmm4, (%rax)
1283 ; SSE2-NEXT: retq
1284 ;
1285 ; AVX1-LABEL: avg_v32i8_2:
1286 ; AVX1: # BB#0:
1287 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1288 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1289 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1290 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1291 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1292 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1293 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1294 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1295 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1]
1296 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
1297 ; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0
1298 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1299 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
1300 ; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
1301 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1302 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
1303 ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm9
1304 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1305 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
1306 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
1307 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1308 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
1309 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
1310 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1311 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
1312 ; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
1313 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1314 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
1315 ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
1316 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1317 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
1318 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1319 ; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
1320 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
1321 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
1322 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1323 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
1324 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
1325 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
1326 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm1
1327 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3
1328 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
1329 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
1330 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
1331 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
1332 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm1
1333 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm3
1334 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
1335 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
1336 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
1337 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
1338 ; AVX1-NEXT: vpsrld $1, %xmm7, %xmm3
1339 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
1340 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
1341 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
1342 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
1343 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1344 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1345 ; AVX1-NEXT: vzeroupper
1346 ; AVX1-NEXT: retq
1347 ;
1348 ; AVX2-LABEL: avg_v32i8_2:
1349 ; AVX2: # BB#0:
1350 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1351 ; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
1352 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1353 ; AVX2-NEXT: vzeroupper
1354 ; AVX2-NEXT: retq
1355 ;
1356 ; AVX512-LABEL: avg_v32i8_2:
1357 ; AVX512: # BB#0:
1358 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1359 ; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
1360 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1361 ; AVX512-NEXT: vzeroupper
1362 ; AVX512-NEXT: retq
1363 %1 = load <32 x i8>, <32 x i8>* %a
1364 %2 = load <32 x i8>, <32 x i8>* %b
1365 %3 = zext <32 x i8> %1 to <32 x i32>
1366 %4 = zext <32 x i8> %2 to <32 x i32>
1367 %5 = add nuw nsw <32 x i32> %3, %4
1368 %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1369 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1370 %8 = trunc <32 x i32> %7 to <32 x i8>
1371 store <32 x i8> %8, <32 x i8>* undef, align 4
1372 ret void
1373 }
1374
1375 define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
1376 ; SSE2-LABEL: avg_v64i8_2:
1377 ; SSE2: # BB#0:
1378 ; SSE2-NEXT: movdqa (%rsi), %xmm2
1379 ; SSE2-NEXT: movdqa 16(%rsi), %xmm7
1380 ; SSE2-NEXT: movdqa 32(%rsi), %xmm15
1381 ; SSE2-NEXT: movdqa 48(%rsi), %xmm1
1382 ; SSE2-NEXT: pxor %xmm0, %xmm0
1383 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1384 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
1385 ; SSE2-NEXT: movdqa %xmm6, %xmm9
1386 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
1387 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
1388 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1389 ; SSE2-NEXT: movdqa %xmm2, %xmm10
1390 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
1391 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1392 ; SSE2-NEXT: movdqa %xmm7, %xmm3
1393 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
1394 ; SSE2-NEXT: movdqa %xmm3, %xmm12
1395 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
1396 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1397 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
1398 ; SSE2-NEXT: movdqa %xmm7, %xmm13
1399 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
1400 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
1401 ; SSE2-NEXT: movdqa %xmm15, %xmm4
1402 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1403 ; SSE2-NEXT: movdqa %xmm4, %xmm14
1404 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
1405 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1406 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
1407 ; SSE2-NEXT: movdqa %xmm15, %xmm5
1408 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
1409 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
1410 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
1411 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1412 ; SSE2-NEXT: movdqa %xmm5, %xmm8
1413 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
1414 ; SSE2-NEXT: movdqa %xmm8, %xmm1
1415 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1416 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
1417 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
1418 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
1419 ; SSE2-NEXT: movdqa %xmm5, %xmm1
1420 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1421 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
1422 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
1423 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
1424 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1]
1425 ; SSE2-NEXT: movdqa %xmm9, %xmm0
1426 ; SSE2-NEXT: paddd %xmm1, %xmm0
1427 ; SSE2-NEXT: paddd %xmm9, %xmm0
1428 ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
1429 ; SSE2-NEXT: movdqa %xmm6, %xmm9
1430 ; SSE2-NEXT: paddd %xmm1, %xmm9
1431 ; SSE2-NEXT: paddd %xmm6, %xmm9
1432 ; SSE2-NEXT: movdqa %xmm10, %xmm11
1433 ; SSE2-NEXT: paddd %xmm1, %xmm11
1434 ; SSE2-NEXT: paddd %xmm10, %xmm11
1435 ; SSE2-NEXT: movdqa %xmm2, %xmm10
1436 ; SSE2-NEXT: paddd %xmm1, %xmm10
1437 ; SSE2-NEXT: paddd %xmm2, %xmm10
1438 ; SSE2-NEXT: movdqa %xmm12, %xmm0
1439 ; SSE2-NEXT: paddd %xmm1, %xmm0
1440 ; SSE2-NEXT: paddd %xmm12, %xmm0
1441 ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
1442 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1443 ; SSE2-NEXT: paddd %xmm1, %xmm6
1444 ; SSE2-NEXT: paddd %xmm3, %xmm6
1445 ; SSE2-NEXT: movdqa %xmm13, %xmm12
1446 ; SSE2-NEXT: paddd %xmm1, %xmm12
1447 ; SSE2-NEXT: paddd %xmm13, %xmm12
1448 ; SSE2-NEXT: movdqa %xmm7, %xmm5
1449 ; SSE2-NEXT: paddd %xmm1, %xmm5
1450 ; SSE2-NEXT: paddd %xmm7, %xmm5
1451 ; SSE2-NEXT: movdqa %xmm14, %xmm13
1452 ; SSE2-NEXT: paddd %xmm1, %xmm13
1453 ; SSE2-NEXT: paddd %xmm14, %xmm13
1454 ; SSE2-NEXT: movdqa %xmm4, %xmm3
1455 ; SSE2-NEXT: paddd %xmm1, %xmm3
1456 ; SSE2-NEXT: paddd %xmm4, %xmm3
1457 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
1458 ; SSE2-NEXT: movdqa %xmm0, %xmm14
1459 ; SSE2-NEXT: paddd %xmm1, %xmm14
1460 ; SSE2-NEXT: paddd %xmm0, %xmm14
1461 ; SSE2-NEXT: movdqa %xmm15, %xmm4
1462 ; SSE2-NEXT: paddd %xmm1, %xmm4
1463 ; SSE2-NEXT: paddd %xmm15, %xmm4
1464 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
1465 ; SSE2-NEXT: movdqa %xmm0, %xmm15
1466 ; SSE2-NEXT: paddd %xmm1, %xmm15
1467 ; SSE2-NEXT: paddd %xmm0, %xmm15
1468 ; SSE2-NEXT: movdqa %xmm8, %xmm7
1469 ; SSE2-NEXT: paddd %xmm1, %xmm7
1470 ; SSE2-NEXT: paddd %xmm8, %xmm7
1471 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
1472 ; SSE2-NEXT: movdqa %xmm0, %xmm8
1473 ; SSE2-NEXT: paddd %xmm1, %xmm8
1474 ; SSE2-NEXT: paddd %xmm0, %xmm8
1475 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
1476 ; SSE2-NEXT: paddd %xmm0, %xmm1
1477 ; SSE2-NEXT: paddd %xmm0, %xmm1
1478 ; SSE2-NEXT: psrld $1, %xmm9
1479 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
1480 ; SSE2-NEXT: psrld $1, %xmm0
1481 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1482 ; SSE2-NEXT: pand %xmm2, %xmm0
1483 ; SSE2-NEXT: pand %xmm2, %xmm9
1484 ; SSE2-NEXT: packuswb %xmm0, %xmm9
1485 ; SSE2-NEXT: psrld $1, %xmm10
1486 ; SSE2-NEXT: psrld $1, %xmm11
1487 ; SSE2-NEXT: pand %xmm2, %xmm11
1488 ; SSE2-NEXT: pand %xmm2, %xmm10
1489 ; SSE2-NEXT: packuswb %xmm11, %xmm10
1490 ; SSE2-NEXT: packuswb %xmm9, %xmm10
1491 ; SSE2-NEXT: psrld $1, %xmm6
1492 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
1493 ; SSE2-NEXT: psrld $1, %xmm0
1494 ; SSE2-NEXT: pand %xmm2, %xmm0
1495 ; SSE2-NEXT: pand %xmm2, %xmm6
1496 ; SSE2-NEXT: packuswb %xmm0, %xmm6
1497 ; SSE2-NEXT: psrld $1, %xmm5
1498 ; SSE2-NEXT: psrld $1, %xmm12
1499 ; SSE2-NEXT: pand %xmm2, %xmm12
1500 ; SSE2-NEXT: pand %xmm2, %xmm5
1501 ; SSE2-NEXT: packuswb %xmm12, %xmm5
1502 ; SSE2-NEXT: packuswb %xmm6, %xmm5
1503 ; SSE2-NEXT: psrld $1, %xmm3
1504 ; SSE2-NEXT: psrld $1, %xmm13
1505 ; SSE2-NEXT: pand %xmm2, %xmm13
1506 ; SSE2-NEXT: pand %xmm2, %xmm3
1507 ; SSE2-NEXT: packuswb %xmm13, %xmm3
1508 ; SSE2-NEXT: psrld $1, %xmm4
1509 ; SSE2-NEXT: psrld $1, %xmm14
1510 ; SSE2-NEXT: pand %xmm2, %xmm14
1511 ; SSE2-NEXT: pand %xmm2, %xmm4
1512 ; SSE2-NEXT: packuswb %xmm14, %xmm4
1513 ; SSE2-NEXT: packuswb %xmm3, %xmm4
1514 ; SSE2-NEXT: psrld $1, %xmm7
1515 ; SSE2-NEXT: psrld $1, %xmm15
1516 ; SSE2-NEXT: pand %xmm2, %xmm15
1517 ; SSE2-NEXT: pand %xmm2, %xmm7
1518 ; SSE2-NEXT: packuswb %xmm15, %xmm7
1519 ; SSE2-NEXT: psrld $1, %xmm1
1520 ; SSE2-NEXT: psrld $1, %xmm8
1521 ; SSE2-NEXT: pand %xmm2, %xmm8
1522 ; SSE2-NEXT: pand %xmm2, %xmm1
1523 ; SSE2-NEXT: packuswb %xmm8, %xmm1
1524 ; SSE2-NEXT: packuswb %xmm7, %xmm1
1525 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1526 ; SSE2-NEXT: movdqu %xmm4, (%rax)
1527 ; SSE2-NEXT: movdqu %xmm5, (%rax)
1528 ; SSE2-NEXT: movdqu %xmm10, (%rax)
1529 ; SSE2-NEXT: retq
1530 ;
1531 ; AVX1-LABEL: avg_v64i8_2:
1532 ; AVX1: # BB#0:
1533 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1534 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1535 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1536 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1537 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1538 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1539 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1540 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1541 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1542 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1543 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1544 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1545 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1546 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1547 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1]
1548 ; AVX1-NEXT: vpaddd %xmm6, %xmm0, %xmm7
1549 ; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0
1550 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
1551 ; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm0
1552 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1553 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm1
1554 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
1555 ; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
1556 ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm1
1557 ; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
1558 ; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm2
1559 ; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
1560 ; AVX1-NEXT: vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
1561 ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm2
1562 ; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
1563 ; AVX1-NEXT: vpaddd %xmm6, %xmm14, %xmm3
1564 ; AVX1-NEXT: vpaddd %xmm3, %xmm14, %xmm14
1565 ; AVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm3
1566 ; AVX1-NEXT: vpaddd %xmm3, %xmm8, %xmm5
1567 ; AVX1-NEXT: vpaddd %xmm6, %xmm11, %xmm3
1568 ; AVX1-NEXT: vpaddd %xmm3, %xmm11, %xmm3
1569 ; AVX1-NEXT: vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
1570 ; AVX1-NEXT: vpaddd %xmm6, %xmm12, %xmm3
1571 ; AVX1-NEXT: vpaddd %xmm3, %xmm12, %xmm8
1572 ; AVX1-NEXT: vpaddd %xmm6, %xmm15, %xmm3
1573 ; AVX1-NEXT: vpaddd %xmm3, %xmm15, %xmm11
1574 ; AVX1-NEXT: vpaddd %xmm6, %xmm13, %xmm3
1575 ; AVX1-NEXT: vpaddd %xmm3, %xmm13, %xmm13
1576 ; AVX1-NEXT: vpaddd %xmm6, %xmm9, %xmm4
1577 ; AVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm12
1578 ; AVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm4
1579 ; AVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm4
1580 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1581 ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm7
1582 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm15
1583 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1584 ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm6
1585 ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm6
1586 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
1587 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
1588 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
1589 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1590 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1591 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1592 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1593 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
1594 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
1595 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
1596 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1597 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1598 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
1599 ; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
1600 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
1601 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
1602 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
1603 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1604 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1605 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
1606 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2
1607 ; AVX1-NEXT: vpsrld $1, %xmm14, %xmm3
1608 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1609 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1610 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
1611 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
1612 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1613 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm1
1614 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
1615 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
1616 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1617 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1618 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
1619 ; AVX1-NEXT: vpsrld $1, %xmm13, %xmm2
1620 ; AVX1-NEXT: vpsrld $1, %xmm11, %xmm3
1621 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1622 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1623 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
1624 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
1625 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm2
1626 ; AVX1-NEXT: vpsrld $1, %xmm12, %xmm3
1627 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1628 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1629 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
1630 ; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3
1631 ; AVX1-NEXT: vpsrld $1, %xmm15, %xmm4
1632 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
1633 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1634 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
1635 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
1636 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1637 ; AVX1-NEXT: vmovups %ymm1, (%rax)
1638 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1639 ; AVX1-NEXT: vzeroupper
1640 ; AVX1-NEXT: retq
1641 ;
1642 ; AVX2-LABEL: avg_v64i8_2:
1643 ; AVX2: # BB#0:
1644 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1645 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1646 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1647 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1648 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1649 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1650 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1651 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1652 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8
1653 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9
1654 ; AVX2-NEXT: vpaddd %ymm9, %ymm0, %ymm9
1655 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm0
1656 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm10
1657 ; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm0
1658 ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2
1659 ; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm0
1660 ; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm3
1661 ; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm0
1662 ; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm4
1663 ; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm0
1664 ; AVX2-NEXT: vpaddd %ymm0, %ymm5, %ymm5
1665 ; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm0
1666 ; AVX2-NEXT: vpaddd %ymm0, %ymm6, %ymm1
1667 ; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm0
1668 ; AVX2-NEXT: vpaddd %ymm0, %ymm7, %ymm0
616 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
617 ; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
618 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
619 ; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
620 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
621 ; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7
622 ; AVX2-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8
623 ; AVX2-NEXT: vpsubd %ymm8, %ymm0, %ymm9
624 ; AVX2-NEXT: vpsubd %ymm8, %ymm1, %ymm10
625 ; AVX2-NEXT: vpsubd %ymm8, %ymm2, %ymm2
626 ; AVX2-NEXT: vpsubd %ymm8, %ymm3, %ymm3
627 ; AVX2-NEXT: vpsubd %ymm8, %ymm4, %ymm4
628 ; AVX2-NEXT: vpsubd %ymm8, %ymm5, %ymm5
629 ; AVX2-NEXT: vpsubd %ymm8, %ymm6, %ymm1
630 ; AVX2-NEXT: vpsubd %ymm8, %ymm7, %ymm0
1669631 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11
1670632 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12
1671633 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
1711673 ; AVX2-NEXT: vzeroupper
1712674 ; AVX2-NEXT: retq
1713675 ;
676 ; AVX512F-LABEL: avg_v64i8:
677 ; AVX512F: # BB#0:
678 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
679 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
680 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
681 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
682 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
683 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
684 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
685 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
686 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
687 ; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
688 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
689 ; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
690 ; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
691 ; AVX512F-NEXT: vpsubd %zmm4, %zmm0, %zmm0
692 ; AVX512F-NEXT: vpsubd %zmm4, %zmm1, %zmm1
693 ; AVX512F-NEXT: vpsubd %zmm4, %zmm2, %zmm2
694 ; AVX512F-NEXT: vpsubd %zmm4, %zmm3, %zmm3
695 ; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3
696 ; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2
697 ; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
698 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
699 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
700 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
701 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
702 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm1
703 ; AVX512F-NEXT: vpmovdb %zmm3, %xmm2
704 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
705 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
706 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
707 ; AVX512F-NEXT: vzeroupper
708 ; AVX512F-NEXT: retq
709 ;
710 ; AVX512BW-LABEL: avg_v64i8:
711 ; AVX512BW: # BB#0:
712 ; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
713 ; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
714 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
715 ; AVX512BW-NEXT: vzeroupper
716 ; AVX512BW-NEXT: retq
717 %1 = load <64 x i8>, <64 x i8>* %a
718 %2 = load <64 x i8>, <64 x i8>* %b
719 %3 = zext <64 x i8> %1 to <64 x i32>
720 %4 = zext <64 x i8> %2 to <64 x i32>
721 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
722 %6 = add nuw nsw <64 x i32> %5, %4
723 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
724 %8 = trunc <64 x i32> %7 to <64 x i8>
725 store <64 x i8> %8, <64 x i8>* undef, align 4
726 ret void
727 }
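The AVX512F body above creates its all-ones register with 'vpternlogd $255, %zmm4, %zmm4, %zmm4' and then folds the +1 term into 'vpsubd' of that register, while the AVX512BW body collapses the whole idiom into a single 'vpavgb'. Below is a minimal stand-alone sketch at zmm width; the function name @inc_v16i32 and any llc invocation flags (e.g. -mattr=+avx512f) are illustrative assumptions, not taken from the test file, and the comment describes the lowering the checks above suggest rather than a verified output.

; Illustrative only: a bare <16 x i32> increment by a splat of 1. Given the
; checks above, the +1 would be expected to become an all-ones zmm register
; (vpternlogd $255) fed into vpsubd instead of a constant-pool load.
define <16 x i32> @inc_v16i32(<16 x i32> %x) {
  %r = add <16 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i32> %r
}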
728
729 define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
730 ; SSE2-LABEL: avg_v4i16:
731 ; SSE2: # BB#0:
732 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
733 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
734 ; SSE2-NEXT: pavgw %xmm0, %xmm1
735 ; SSE2-NEXT: movq %xmm1, (%rax)
736 ; SSE2-NEXT: retq
737 ;
738 ; AVX-LABEL: avg_v4i16:
739 ; AVX: # BB#0:
740 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
741 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
742 ; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
743 ; AVX-NEXT: vmovq %xmm0, (%rax)
744 ; AVX-NEXT: retq
745 %1 = load <4 x i16>, <4 x i16>* %a
746 %2 = load <4 x i16>, <4 x i16>* %b
747 %3 = zext <4 x i16> %1 to <4 x i32>
748 %4 = zext <4 x i16> %2 to <4 x i32>
749 %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
750 %6 = add nuw nsw <4 x i32> %5, %4
751 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
752 %8 = trunc <4 x i32> %7 to <4 x i16>
753 store <4 x i16> %8, <4 x i16>* undef, align 4
754 ret void
755 }
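The avg_v4i16 body above is the rounding-average idiom in its smallest form: widen both inputs, add them plus a splat of 1, shift right by 1, and truncate back. A scalar sketch of the same arithmetic follows; the function name @avg_round_up is illustrative and not taken from the test file. For example, a = 5 and b = 8 give (5 + 8 + 1) >> 1 = 7, so the half-way case rounds up, which matches the rounding of the pavgw/vpavgw that the checks select.

define i16 @avg_round_up(i16 %a, i16 %b) {
  %ea = zext i16 %a to i32                 ; widen so the sum cannot wrap
  %eb = zext i16 %b to i32
  %sum = add nuw nsw i32 %ea, %eb
  %sum1 = add nuw nsw i32 %sum, 1          ; the splat-1 term from the vector tests
  %shr = lshr i32 %sum1, 1                 ; divide by two, rounding up on odd sums
  %res = trunc i32 %shr to i16             ; the result always fits back in i16
  ret i16 %res
}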
756
757 define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
758 ; SSE2-LABEL: avg_v8i16:
759 ; SSE2: # BB#0:
760 ; SSE2-NEXT: movdqa (%rsi), %xmm0
761 ; SSE2-NEXT: pavgw (%rdi), %xmm0
762 ; SSE2-NEXT: movdqu %xmm0, (%rax)
763 ; SSE2-NEXT: retq
764 ;
765 ; AVX-LABEL: avg_v8i16:
766 ; AVX: # BB#0:
767 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
768 ; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
769 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
770 ; AVX-NEXT: retq
771 %1 = load <8 x i16>, <8 x i16>* %a
772 %2 = load <8 x i16>, <8 x i16>* %b
773 %3 = zext <8 x i16> %1 to <8 x i32>
774 %4 = zext <8 x i16> %2 to <8 x i32>
775 %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
776 %6 = add nuw nsw <8 x i32> %5, %4
777 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
778 %8 = trunc <8 x i32> %7 to <8 x i16>
779 store <8 x i16> %8, <8 x i16>* undef, align 4
780 ret void
781 }
782
783 define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
784 ; SSE2-LABEL: avg_v16i16:
785 ; SSE2: # BB#0:
786 ; SSE2-NEXT: movdqa (%rdi), %xmm2
787 ; SSE2-NEXT: movdqa 16(%rdi), %xmm4
788 ; SSE2-NEXT: movdqa (%rsi), %xmm0
789 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
790 ; SSE2-NEXT: pxor %xmm5, %xmm5
791 ; SSE2-NEXT: movdqa %xmm2, %xmm6
792 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
793 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
794 ; SSE2-NEXT: movdqa %xmm4, %xmm7
795 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
796 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
797 ; SSE2-NEXT: movdqa %xmm0, %xmm3
798 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
799 ; SSE2-NEXT: paddd %xmm6, %xmm3
800 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
801 ; SSE2-NEXT: paddd %xmm2, %xmm0
802 ; SSE2-NEXT: movdqa %xmm1, %xmm2
803 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
804 ; SSE2-NEXT: paddd %xmm7, %xmm2
805 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
806 ; SSE2-NEXT: paddd %xmm4, %xmm1
807 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
808 ; SSE2-NEXT: psubd %xmm4, %xmm3
809 ; SSE2-NEXT: psubd %xmm4, %xmm0
810 ; SSE2-NEXT: psubd %xmm4, %xmm2
811 ; SSE2-NEXT: psubd %xmm4, %xmm1
812 ; SSE2-NEXT: psrld $1, %xmm1
813 ; SSE2-NEXT: psrld $1, %xmm2
814 ; SSE2-NEXT: psrld $1, %xmm0
815 ; SSE2-NEXT: psrld $1, %xmm3
816 ; SSE2-NEXT: pslld $16, %xmm3
817 ; SSE2-NEXT: psrad $16, %xmm3
818 ; SSE2-NEXT: pslld $16, %xmm0
819 ; SSE2-NEXT: psrad $16, %xmm0
820 ; SSE2-NEXT: packssdw %xmm3, %xmm0
821 ; SSE2-NEXT: pslld $16, %xmm2
822 ; SSE2-NEXT: psrad $16, %xmm2
823 ; SSE2-NEXT: pslld $16, %xmm1
824 ; SSE2-NEXT: psrad $16, %xmm1
825 ; SSE2-NEXT: packssdw %xmm2, %xmm1
826 ; SSE2-NEXT: movdqu %xmm1, (%rax)
827 ; SSE2-NEXT: movdqu %xmm0, (%rax)
828 ; SSE2-NEXT: retq
829 ;
830 ; AVX1-LABEL: avg_v16i16:
831 ; AVX1: # BB#0:
832 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
833 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
834 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
835 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
836 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
837 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
838 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
839 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
840 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
841 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
842 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
843 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
844 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
845 ; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
846 ; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1
847 ; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
848 ; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
849 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
850 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
851 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
852 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
853 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
854 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
855 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
856 ; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
857 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
858 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
859 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
860 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
861 ; AVX1-NEXT: vmovups %ymm0, (%rax)
862 ; AVX1-NEXT: vzeroupper
863 ; AVX1-NEXT: retq
864 ;
865 ; AVX2-LABEL: avg_v16i16:
866 ; AVX2: # BB#0:
867 ; AVX2-NEXT: vmovdqa (%rsi), %ymm0
868 ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
869 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
870 ; AVX2-NEXT: vzeroupper
871 ; AVX2-NEXT: retq
872 ;
873 ; AVX512-LABEL: avg_v16i16:
874 ; AVX512: # BB#0:
875 ; AVX512-NEXT: vmovdqa (%rsi), %ymm0
876 ; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0
877 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
878 ; AVX512-NEXT: vzeroupper
879 ; AVX512-NEXT: retq
880 %1 = load <16 x i16>, <16 x i16>* %a
881 %2 = load <16 x i16>, <16 x i16>* %b
882 %3 = zext <16 x i16> %1 to <16 x i32>
883 %4 = zext <16 x i16> %2 to <16 x i32>
884 %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
885 %6 = add nuw nsw <16 x i32> %5, %4
886 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
887 %8 = trunc <16 x i32> %7 to <16 x i16>
888 store <16 x i16> %8, <16 x i16>* undef, align 4
889 ret void
890 }
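In the SSE2 and AVX1 bodies above, the splat-of-1 operand never appears as a loaded constant: pcmpeqd/vpcmpeqd produces an all-ones register and the +1 is applied as psubd/vpsubd of that register. A stand-alone sketch of the same +1 splat at 128-bit width follows; the function name @inc_v4i32 is an illustrative assumption, and the expected pcmpeqd/psubd lowering is inferred from the checks above rather than quoted from them.

; Illustrative only: a bare <4 x i32> increment by a splat of 1.
define <4 x i32> @inc_v4i32(<4 x i32> %x) {
  %r = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %r
}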
891
892 define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
893 ; SSE2-LABEL: avg_v32i16:
894 ; SSE2: # BB#0:
895 ; SSE2-NEXT: movdqa (%rdi), %xmm4
896 ; SSE2-NEXT: movdqa 16(%rdi), %xmm11
897 ; SSE2-NEXT: movdqa 32(%rdi), %xmm10
898 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8
899 ; SSE2-NEXT: movdqa (%rsi), %xmm9
900 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
901 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
902 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
903 ; SSE2-NEXT: pxor %xmm0, %xmm0
904 ; SSE2-NEXT: movdqa %xmm4, %xmm6
905 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
906 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
907 ; SSE2-NEXT: movdqa %xmm11, %xmm5
908 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
909 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
910 ; SSE2-NEXT: movdqa %xmm10, %xmm12
911 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
912 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
913 ; SSE2-NEXT: movdqa %xmm8, %xmm13
914 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
915 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
916 ; SSE2-NEXT: movdqa %xmm9, %xmm7
917 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
918 ; SSE2-NEXT: paddd %xmm6, %xmm7
919 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
920 ; SSE2-NEXT: paddd %xmm4, %xmm9
921 ; SSE2-NEXT: movdqa %xmm1, %xmm6
922 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
923 ; SSE2-NEXT: paddd %xmm5, %xmm6
924 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
925 ; SSE2-NEXT: paddd %xmm11, %xmm1
926 ; SSE2-NEXT: movdqa %xmm2, %xmm5
927 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
928 ; SSE2-NEXT: paddd %xmm12, %xmm5
929 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
930 ; SSE2-NEXT: paddd %xmm10, %xmm2
931 ; SSE2-NEXT: movdqa %xmm3, %xmm4
932 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
933 ; SSE2-NEXT: paddd %xmm13, %xmm4
934 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
935 ; SSE2-NEXT: paddd %xmm8, %xmm3
936 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
937 ; SSE2-NEXT: psubd %xmm0, %xmm7
938 ; SSE2-NEXT: psubd %xmm0, %xmm9
939 ; SSE2-NEXT: psubd %xmm0, %xmm6
940 ; SSE2-NEXT: psubd %xmm0, %xmm1
941 ; SSE2-NEXT: psubd %xmm0, %xmm5
942 ; SSE2-NEXT: psubd %xmm0, %xmm2
943 ; SSE2-NEXT: psubd %xmm0, %xmm4
944 ; SSE2-NEXT: psubd %xmm0, %xmm3
945 ; SSE2-NEXT: psrld $1, %xmm3
946 ; SSE2-NEXT: psrld $1, %xmm4
947 ; SSE2-NEXT: psrld $1, %xmm2
948 ; SSE2-NEXT: psrld $1, %xmm5
949 ; SSE2-NEXT: psrld $1, %xmm1
950 ; SSE2-NEXT: psrld $1, %xmm6
951 ; SSE2-NEXT: psrld $1, %xmm9
952 ; SSE2-NEXT: psrld $1, %xmm7
953 ; SSE2-NEXT: pslld $16, %xmm7
954 ; SSE2-NEXT: psrad $16, %xmm7
955 ; SSE2-NEXT: pslld $16, %xmm9
956 ; SSE2-NEXT: psrad $16, %xmm9
957 ; SSE2-NEXT: packssdw %xmm7, %xmm9
958 ; SSE2-NEXT: pslld $16, %xmm6
959 ; SSE2-NEXT: psrad $16, %xmm6
960 ; SSE2-NEXT: pslld $16, %xmm1
961 ; SSE2-NEXT: psrad $16, %xmm1
962 ; SSE2-NEXT: packssdw %xmm6, %xmm1
963 ; SSE2-NEXT: pslld $16, %xmm5
964 ; SSE2-NEXT: psrad $16, %xmm5
965 ; SSE2-NEXT: pslld $16, %xmm2
966 ; SSE2-NEXT: psrad $16, %xmm2
967 ; SSE2-NEXT: packssdw %xmm5, %xmm2
968 ; SSE2-NEXT: pslld $16, %xmm4
969 ; SSE2-NEXT: psrad $16, %xmm4
970 ; SSE2-NEXT: pslld $16, %xmm3
971 ; SSE2-NEXT: psrad $16, %xmm3
972 ; SSE2-NEXT: packssdw %xmm4, %xmm3
973 ; SSE2-NEXT: movdqu %xmm3, (%rax)
974 ; SSE2-NEXT: movdqu %xmm2, (%rax)
975 ; SSE2-NEXT: movdqu %xmm1, (%rax)
976 ; SSE2-NEXT: movdqu %xmm9, (%rax)
977 ; SSE2-NEXT: retq
978 ;
979 ; AVX1-LABEL: avg_v32i16:
980 ; AVX1: # BB#0:
981 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
982 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
983 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
984 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
985 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
986 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
987 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
988 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
989 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
990 ; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
991 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
992 ; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
993 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
994 ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
995 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
996 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
997 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
998 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
999 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1000 ; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
1001 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1002 ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
1003 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1004 ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
1005 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1006 ; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
1007 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
1008 ; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
1009 ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
1010 ; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
1011 ; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
1012 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
1013 ; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
1014 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
1015 ; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
1016 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
1017 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
1018 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
1019 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
1020 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
1021 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
1022 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
1023 ; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7]
1024 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
1025 ; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
1026 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
1027 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
1028 ; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
1029 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1030 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
1031 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
1032 ; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
1033 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7]
1034 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
1035 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1036 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1037 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1038 ; AVX1-NEXT: vmovups %ymm1, (%rax)
1039 ; AVX1-NEXT: vzeroupper
1040 ; AVX1-NEXT: retq
1041 ;
1042 ; AVX2-LABEL: avg_v32i16:
1043 ; AVX2: # BB#0:
1044 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1045 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1046 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1047 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1048 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1049 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
1050 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1051 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
1052 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1053 ; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
1054 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1055 ; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
1056 ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
1057 ; AVX2-NEXT: vpsubd %ymm4, %ymm0, %ymm0
1058 ; AVX2-NEXT: vpsubd %ymm4, %ymm1, %ymm1
1059 ; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2
1060 ; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3
1061 ; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
1062 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
1063 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
1064 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
1065 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1066 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1067 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1068 ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
1069 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1070 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1071 ; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1
1072 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1073 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2
1074 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1075 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
1076 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1077 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1078 ; AVX2-NEXT: vzeroupper
1079 ; AVX2-NEXT: retq
1080 ;
1081 ; AVX512F-LABEL: avg_v32i16:
1082 ; AVX512F: # BB#0:
1083 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1084 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1085 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1086 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
1087 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1088 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
1089 ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
1090 ; AVX512F-NEXT: vpsubd %zmm2, %zmm0, %zmm0
1091 ; AVX512F-NEXT: vpsubd %zmm2, %zmm1, %zmm1
1092 ; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
1093 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
1094 ; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
1095 ; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
1096 ; AVX512F-NEXT: vzeroupper
1097 ; AVX512F-NEXT: retq
1098 ;
1099 ; AVX512BW-LABEL: avg_v32i16:
1100 ; AVX512BW: # BB#0:
1101 ; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
1102 ; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
1103 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
1104 ; AVX512BW-NEXT: vzeroupper
1105 ; AVX512BW-NEXT: retq
1106 %1 = load <32 x i16>, <32 x i16>* %a
1107 %2 = load <32 x i16>, <32 x i16>* %b
1108 %3 = zext <32 x i16> %1 to <32 x i32>
1109 %4 = zext <32 x i16> %2 to <32 x i32>
1110 %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1111 %6 = add nuw nsw <32 x i32> %5, %4
1112 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1113 %8 = trunc <32 x i32> %7 to <32 x i16>
1114 store <32 x i16> %8, <32 x i16>* undef, align 4
1115 ret void
1116 }
1117
1118 define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
1119 ; SSE2-LABEL: avg_v4i8_2:
1120 ; SSE2: # BB#0:
1121 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1122 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1123 ; SSE2-NEXT: pavgb %xmm0, %xmm1
1124 ; SSE2-NEXT: movd %xmm1, (%rax)
1125 ; SSE2-NEXT: retq
1126 ;
1127 ; AVX-LABEL: avg_v4i8_2:
1128 ; AVX: # BB#0:
1129 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1130 ; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1131 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
1132 ; AVX-NEXT: vmovd %xmm0, (%rax)
1133 ; AVX-NEXT: retq
1134 %1 = load <4 x i8>, <4 x i8>* %a
1135 %2 = load <4 x i8>, <4 x i8>* %b
1136 %3 = zext <4 x i8> %1 to <4 x i32>
1137 %4 = zext <4 x i8> %2 to <4 x i32>
1138 %5 = add nuw nsw <4 x i32> %3, %4
1139 %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
1140 %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
1141 %8 = trunc <4 x i32> %7 to <4 x i8>
1142 store <4 x i8> %8, <4 x i8>* undef, align 4
1143 ret void
1144 }
1145
1146 define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
1147 ; SSE2-LABEL: avg_v8i8_2:
1148 ; SSE2: # BB#0:
1149 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1150 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1151 ; SSE2-NEXT: pavgb %xmm0, %xmm1
1152 ; SSE2-NEXT: movq %xmm1, (%rax)
1153 ; SSE2-NEXT: retq
1154 ;
1155 ; AVX-LABEL: avg_v8i8_2:
1156 ; AVX: # BB#0:
1157 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1158 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1159 ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
1160 ; AVX-NEXT: vmovq %xmm0, (%rax)
1161 ; AVX-NEXT: retq
1162 %1 = load <8 x i8>, <8 x i8>* %a
1163 %2 = load <8 x i8>, <8 x i8>* %b
1164 %3 = zext <8 x i8> %1 to <8 x i32>
1165 %4 = zext <8 x i8> %2 to <8 x i32>
1166 %5 = add nuw nsw <8 x i32> %3, %4
1167 %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1168 %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1169 %8 = trunc <8 x i32> %7 to <8 x i8>
1170 store <8 x i8> %8, <8 x i8>* undef, align 4
1171 ret void
1172 }
1173
1174 define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
1175 ; SSE2-LABEL: avg_v16i8_2:
1176 ; SSE2: # BB#0:
1177 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1178 ; SSE2-NEXT: pavgb (%rsi), %xmm0
1179 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1180 ; SSE2-NEXT: retq
1181 ;
1182 ; AVX-LABEL: avg_v16i8_2:
1183 ; AVX: # BB#0:
1184 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1185 ; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
1186 ; AVX-NEXT: vmovdqu %xmm0, (%rax)
1187 ; AVX-NEXT: retq
1188 %1 = load <16 x i8>, <16 x i8>* %a
1189 %2 = load <16 x i8>, <16 x i8>* %b
1190 %3 = zext <16 x i8> %1 to <16 x i32>
1191 %4 = zext <16 x i8> %2 to <16 x i32>
1192 %5 = add nuw nsw <16 x i32> %3, %4
1193 %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1194 %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1195 %8 = trunc <16 x i32> %7 to <16 x i8>
1196 store <16 x i8> %8, <16 x i8>* undef, align 4
1197 ret void
1198 }
1199
1200 define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
1201 ; SSE2-LABEL: avg_v32i8_2:
1202 ; SSE2: # BB#0:
1203 ; SSE2-NEXT: movdqa (%rdi), %xmm3
1204 ; SSE2-NEXT: movdqa 16(%rdi), %xmm8
1205 ; SSE2-NEXT: movdqa (%rsi), %xmm0
1206 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
1207 ; SSE2-NEXT: pxor %xmm4, %xmm4
1208 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1209 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
1210 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1211 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1212 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1213 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1214 ; SSE2-NEXT: movdqa %xmm3, %xmm12
1215 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
1216 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1217 ; SSE2-NEXT: movdqa %xmm8, %xmm7
1218 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
1219 ; SSE2-NEXT: movdqa %xmm7, %xmm11
1220 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
1221 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
1222 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
1223 ; SSE2-NEXT: movdqa %xmm8, %xmm10
1224 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
1225 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
1226 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1227 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
1228 ; SSE2-NEXT: movdqa %xmm2, %xmm9
1229 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
1230 ; SSE2-NEXT: paddd %xmm6, %xmm9
1231 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1232 ; SSE2-NEXT: paddd %xmm5, %xmm2
1233 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1234 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1235 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1236 ; SSE2-NEXT: paddd %xmm12, %xmm5
1237 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1238 ; SSE2-NEXT: paddd %xmm3, %xmm0
1239 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1240 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
1241 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1242 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1243 ; SSE2-NEXT: paddd %xmm11, %xmm6
1244 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1245 ; SSE2-NEXT: paddd %xmm7, %xmm3
1246 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1247 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1248 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
1249 ; SSE2-NEXT: paddd %xmm10, %xmm7
1250 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1251 ; SSE2-NEXT: paddd %xmm8, %xmm1
1252 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
1253 ; SSE2-NEXT: psubd %xmm4, %xmm9
1254 ; SSE2-NEXT: psubd %xmm4, %xmm2
1255 ; SSE2-NEXT: psubd %xmm4, %xmm5
1256 ; SSE2-NEXT: psubd %xmm4, %xmm0
1257 ; SSE2-NEXT: psubd %xmm4, %xmm6
1258 ; SSE2-NEXT: psubd %xmm4, %xmm3
1259 ; SSE2-NEXT: psubd %xmm4, %xmm7
1260 ; SSE2-NEXT: psubd %xmm4, %xmm1
1261 ; SSE2-NEXT: psrld $1, %xmm1
1262 ; SSE2-NEXT: psrld $1, %xmm7
1263 ; SSE2-NEXT: psrld $1, %xmm3
1264 ; SSE2-NEXT: psrld $1, %xmm6
1265 ; SSE2-NEXT: psrld $1, %xmm0
1266 ; SSE2-NEXT: psrld $1, %xmm5
1267 ; SSE2-NEXT: psrld $1, %xmm2
1268 ; SSE2-NEXT: psrld $1, %xmm9
1269 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1270 ; SSE2-NEXT: pand %xmm4, %xmm9
1271 ; SSE2-NEXT: pand %xmm4, %xmm2
1272 ; SSE2-NEXT: packuswb %xmm9, %xmm2
1273 ; SSE2-NEXT: pand %xmm4, %xmm5
1274 ; SSE2-NEXT: pand %xmm4, %xmm0
1275 ; SSE2-NEXT: packuswb %xmm5, %xmm0
1276 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1277 ; SSE2-NEXT: pand %xmm4, %xmm6
1278 ; SSE2-NEXT: pand %xmm4, %xmm3
1279 ; SSE2-NEXT: packuswb %xmm6, %xmm3
1280 ; SSE2-NEXT: pand %xmm4, %xmm7
1281 ; SSE2-NEXT: pand %xmm4, %xmm1
1282 ; SSE2-NEXT: packuswb %xmm7, %xmm1
1283 ; SSE2-NEXT: packuswb %xmm3, %xmm1
1284 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1285 ; SSE2-NEXT: movdqu %xmm0, (%rax)
1286 ; SSE2-NEXT: retq
1287 ;
1288 ; AVX1-LABEL: avg_v32i8_2:
1289 ; AVX1: # BB#0:
1290 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1291 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1292 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1293 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1294 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1295 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1296 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1297 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1298 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1299 ; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9
1300 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1301 ; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1
1302 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1303 ; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
1304 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1305 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
1306 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1307 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
1308 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1309 ; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5
1310 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1311 ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
1312 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1313 ; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7
1314 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1315 ; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
1316 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
1317 ; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2
1318 ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
1319 ; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4
1320 ; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5
1321 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6
1322 ; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
1323 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9
1324 ; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
1325 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
1326 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
1327 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
1328 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
1329 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
1330 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7
1331 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1332 ; AVX1-NEXT: vpand %xmm0, %xmm7, %xmm7
1333 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm1
1334 ; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1
1335 ; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm2
1336 ; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm3
1337 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
1338 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
1339 ; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm2
1340 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm3
1341 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
1342 ; AVX1-NEXT: vpand %xmm0, %xmm6, %xmm3
1343 ; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
1344 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1345 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1346 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1347 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1348 ; AVX1-NEXT: vzeroupper
1349 ; AVX1-NEXT: retq
1350 ;
1351 ; AVX2-LABEL: avg_v32i8_2:
1352 ; AVX2: # BB#0:
1353 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1354 ; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
1355 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1356 ; AVX2-NEXT: vzeroupper
1357 ; AVX2-NEXT: retq
1358 ;
1359 ; AVX512-LABEL: avg_v32i8_2:
1360 ; AVX512: # BB#0:
1361 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1362 ; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
1363 ; AVX512-NEXT: vmovdqu %ymm0, (%rax)
1364 ; AVX512-NEXT: vzeroupper
1365 ; AVX512-NEXT: retq
1366 %1 = load <32 x i8>, <32 x i8>* %a
1367 %2 = load <32 x i8>, <32 x i8>* %b
1368 %3 = zext <32 x i8> %1 to <32 x i32>
1369 %4 = zext <32 x i8> %2 to <32 x i32>
1370 %5 = add nuw nsw <32 x i32> %3, %4
1371 %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1372 %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1373 %8 = trunc <32 x i32> %7 to <32 x i8>
1374 store <32 x i8> %8, <32 x i8>* undef, align 4
1375 ret void
1376 }
1377
1378 define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
1379 ; SSE2-LABEL: avg_v64i8_2:
1380 ; SSE2: # BB#0:
1381 ; SSE2-NEXT: movdqa (%rsi), %xmm14
1382 ; SSE2-NEXT: movdqa 16(%rsi), %xmm12
1383 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
1384 ; SSE2-NEXT: movdqa 48(%rsi), %xmm1
1385 ; SSE2-NEXT: pxor %xmm0, %xmm0
1386 ; SSE2-NEXT: movdqa %xmm14, %xmm7
1387 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
1388 ; SSE2-NEXT: movdqa %xmm7, %xmm15
1389 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
1390 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
1391 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
1392 ; SSE2-NEXT: movdqa %xmm14, %xmm8
1393 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
1394 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
1395 ; SSE2-NEXT: movdqa %xmm12, %xmm6
1396 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
1397 ; SSE2-NEXT: movdqa %xmm6, %xmm13
1398 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
1399 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
1400 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
1401 ; SSE2-NEXT: movdqa %xmm12, %xmm9
1402 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
1403 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
1404 ; SSE2-NEXT: movdqa %xmm2, %xmm5
1405 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
1406 ; SSE2-NEXT: movdqa %xmm5, %xmm11
1407 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
1408 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
1409 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1410 ; SSE2-NEXT: movdqa %xmm2, %xmm10
1411 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
1412 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1413 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1414 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1415 ; SSE2-NEXT: movdqa %xmm4, %xmm3
1416 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1417 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
1418 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1419 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1420 ; SSE2-NEXT: movdqa %xmm1, %xmm3
1421 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1422 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1423 ; SSE2-NEXT: paddd %xmm1, %xmm1
1424 ; SSE2-NEXT: paddd %xmm3, %xmm3
1425 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
1426 ; SSE2-NEXT: paddd %xmm4, %xmm4
1427 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
1428 ; SSE2-NEXT: paddd %xmm3, %xmm3
1429 ; SSE2-NEXT: paddd %xmm2, %xmm2
1430 ; SSE2-NEXT: paddd %xmm10, %xmm10
1431 ; SSE2-NEXT: paddd %xmm5, %xmm5
1432 ; SSE2-NEXT: paddd %xmm11, %xmm11
1433 ; SSE2-NEXT: paddd %xmm12, %xmm12
1434 ; SSE2-NEXT: paddd %xmm9, %xmm9
1435 ; SSE2-NEXT: paddd %xmm6, %xmm6
1436 ; SSE2-NEXT: paddd %xmm13, %xmm13
1437 ; SSE2-NEXT: paddd %xmm14, %xmm14
1438 ; SSE2-NEXT: paddd %xmm8, %xmm8
1439 ; SSE2-NEXT: paddd %xmm7, %xmm7
1440 ; SSE2-NEXT: paddd %xmm15, %xmm15
1441 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
1442 ; SSE2-NEXT: psubd %xmm0, %xmm15
1443 ; SSE2-NEXT: psubd %xmm0, %xmm7
1444 ; SSE2-NEXT: psubd %xmm0, %xmm8
1445 ; SSE2-NEXT: psubd %xmm0, %xmm14
1446 ; SSE2-NEXT: psubd %xmm0, %xmm13
1447 ; SSE2-NEXT: psubd %xmm0, %xmm6
1448 ; SSE2-NEXT: psubd %xmm0, %xmm9
1449 ; SSE2-NEXT: psubd %xmm0, %xmm12
1450 ; SSE2-NEXT: psubd %xmm0, %xmm11
1451 ; SSE2-NEXT: psubd %xmm0, %xmm5
1452 ; SSE2-NEXT: psubd %xmm0, %xmm10
1453 ; SSE2-NEXT: psubd %xmm0, %xmm2
1454 ; SSE2-NEXT: psubd %xmm0, %xmm3
1455 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
1456 ; SSE2-NEXT: psubd %xmm0, %xmm4
1457 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
1458 ; SSE2-NEXT: psubd %xmm0, %xmm3
1459 ; SSE2-NEXT: psubd %xmm0, %xmm1
1460 ; SSE2-NEXT: psrld $1, %xmm7
1461 ; SSE2-NEXT: psrld $1, %xmm15
1462 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1463 ; SSE2-NEXT: pand %xmm0, %xmm15
1464 ; SSE2-NEXT: pand %xmm0, %xmm7
1465 ; SSE2-NEXT: packuswb %xmm15, %xmm7
1466 ; SSE2-NEXT: psrld $1, %xmm14
1467 ; SSE2-NEXT: psrld $1, %xmm8
1468 ; SSE2-NEXT: pand %xmm0, %xmm8
1469 ; SSE2-NEXT: pand %xmm0, %xmm14
1470 ; SSE2-NEXT: packuswb %xmm8, %xmm14
1471 ; SSE2-NEXT: packuswb %xmm7, %xmm14
1472 ; SSE2-NEXT: psrld $1, %xmm6
1473 ; SSE2-NEXT: psrld $1, %xmm13
1474 ; SSE2-NEXT: pand %xmm0, %xmm13
1475 ; SSE2-NEXT: pand %xmm0, %xmm6
1476 ; SSE2-NEXT: packuswb %xmm13, %xmm6
1477 ; SSE2-NEXT: psrld $1, %xmm12
1478 ; SSE2-NEXT: psrld $1, %xmm9
1479 ; SSE2-NEXT: pand %xmm0, %xmm9
1480 ; SSE2-NEXT: pand %xmm0, %xmm12
1481 ; SSE2-NEXT: packuswb %xmm9, %xmm12
1482 ; SSE2-NEXT: packuswb %xmm6, %xmm12
1483 ; SSE2-NEXT: psrld $1, %xmm5
1484 ; SSE2-NEXT: psrld $1, %xmm11
1485 ; SSE2-NEXT: pand %xmm0, %xmm11
1486 ; SSE2-NEXT: pand %xmm0, %xmm5
1487 ; SSE2-NEXT: packuswb %xmm11, %xmm5
1488 ; SSE2-NEXT: psrld $1, %xmm2
1489 ; SSE2-NEXT: psrld $1, %xmm10
1490 ; SSE2-NEXT: pand %xmm0, %xmm10
1491 ; SSE2-NEXT: pand %xmm0, %xmm2
1492 ; SSE2-NEXT: packuswb %xmm10, %xmm2
1493 ; SSE2-NEXT: packuswb %xmm5, %xmm2
1494 ; SSE2-NEXT: psrld $1, %xmm4
1495 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
1496 ; SSE2-NEXT: psrld $1, %xmm5
1497 ; SSE2-NEXT: pand %xmm0, %xmm5
1498 ; SSE2-NEXT: pand %xmm0, %xmm4
1499 ; SSE2-NEXT: packuswb %xmm5, %xmm4
1500 ; SSE2-NEXT: psrld $1, %xmm1
1501 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1502 ; SSE2-NEXT: psrld $1, %xmm5
1503 ; SSE2-NEXT: pand %xmm0, %xmm5
1504 ; SSE2-NEXT: pand %xmm0, %xmm1
1505 ; SSE2-NEXT: packuswb %xmm5, %xmm1
1506 ; SSE2-NEXT: packuswb %xmm4, %xmm1
1507 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1508 ; SSE2-NEXT: movdqu %xmm2, (%rax)
1509 ; SSE2-NEXT: movdqu %xmm12, (%rax)
1510 ; SSE2-NEXT: movdqu %xmm14, (%rax)
1511 ; SSE2-NEXT: retq
1512 ;
1513 ; AVX1-LABEL: avg_v64i8_2:
1514 ; AVX1: # BB#0:
1515 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1516 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1517 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1518 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1519 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1520 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1521 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1522 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1523 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1524 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1525 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1526 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1527 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1528 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1529 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1530 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1531 ; AVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7
1532 ; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
1533 ; AVX1-NEXT: vpaddd %xmm6, %xmm6, %xmm6
1534 ; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
1535 ; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm6
1536 ; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5
1537 ; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm4
1538 ; AVX1-NEXT: vpaddd %xmm2, %xmm2, %xmm3
1539 ; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2
1540 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm1
1541 ; AVX1-NEXT: vpaddd %xmm15, %xmm15, %xmm15
1542 ; AVX1-NEXT: vpaddd %xmm14, %xmm14, %xmm14
1543 ; AVX1-NEXT: vpaddd %xmm13, %xmm13, %xmm13
1544 ; AVX1-NEXT: vpaddd %xmm12, %xmm12, %xmm12
1545 ; AVX1-NEXT: vpaddd %xmm11, %xmm11, %xmm11
1546 ; AVX1-NEXT: vpaddd %xmm10, %xmm10, %xmm10
1547 ; AVX1-NEXT: vpaddd %xmm9, %xmm9, %xmm9
1548 ; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm8
1549 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1550 ; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm7
1551 ; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
1552 ; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8
1553 ; AVX1-NEXT: vpsubd %xmm0, %xmm10, %xmm10
1554 ; AVX1-NEXT: vpsubd %xmm0, %xmm11, %xmm9
1555 ; AVX1-NEXT: vpsubd %xmm0, %xmm12, %xmm7
1556 ; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
1557 ; AVX1-NEXT: vpsubd %xmm0, %xmm13, %xmm11
1558 ; AVX1-NEXT: vpsubd %xmm0, %xmm14, %xmm13
1559 ; AVX1-NEXT: vpsubd %xmm0, %xmm15, %xmm12
1560 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1
1561 ; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm15
1562 ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm2
1563 ; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm14
1564 ; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm3
1565 ; AVX1-NEXT: vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
1566 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm5
1567 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
1568 ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
1569 ; AVX1-NEXT: vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
1570 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
1571 ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0
1572 ; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
1573 ; AVX1-NEXT: vpsrld $1, %xmm8, %xmm6
1574 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
1575 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm8
1576 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1577 ; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
1578 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1579 ; AVX1-NEXT: vpackuswb %xmm8, %xmm6, %xmm8
1580 ; AVX1-NEXT: vpsrld $1, %xmm9, %xmm6
1581 ; AVX1-NEXT: vpsrld $1, %xmm10, %xmm4
1582 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
1583 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1584 ; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4
1585 ; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm4
1586 ; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6
1587 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
1588 ; AVX1-NEXT: vpsrld $1, %xmm0, %xmm3
1589 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1590 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1591 ; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
1592 ; AVX1-NEXT: vpsrld $1, %xmm12, %xmm6
1593 ; AVX1-NEXT: vpsrld $1, %xmm13, %xmm0
1594 ; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
1595 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
1596 ; AVX1-NEXT: vpackuswb %xmm0, %xmm6, %xmm0
1597 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
1598 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1599 ; AVX1-NEXT: vpsrld $1, %xmm15, %xmm3
1600 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
1601 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
1602 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1603 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
1604 ; AVX1-NEXT: vpsrld $1, %xmm14, %xmm3
1605 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
1606 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1607 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1608 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
1609 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
1610 ; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2
1611 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
1612 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
1613 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1614 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
1615 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
1616 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
1617 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
1618 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
1619 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
1620 ; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4
1621 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
1622 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
1623 ; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
1624 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1625 ; AVX1-NEXT: vmovups %ymm1, (%rax)
1626 ; AVX1-NEXT: vmovups %ymm0, (%rax)
1627 ; AVX1-NEXT: vzeroupper
1628 ; AVX1-NEXT: retq
1629 ;
1630 ; AVX2-LABEL: avg_v64i8_2:
1631 ; AVX2: # BB#0:
1632 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1633 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1634 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1635 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1636 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1637 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1638 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1639 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
1640 ; AVX2-NEXT: vpaddd %ymm7, %ymm7, %ymm7
1641 ; AVX2-NEXT: vpaddd %ymm6, %ymm6, %ymm6
1642 ; AVX2-NEXT: vpaddd %ymm5, %ymm5, %ymm5
1643 ; AVX2-NEXT: vpaddd %ymm4, %ymm4, %ymm4
1644 ; AVX2-NEXT: vpaddd %ymm3, %ymm3, %ymm3
1645 ; AVX2-NEXT: vpaddd %ymm2, %ymm2, %ymm2
1646 ; AVX2-NEXT: vpaddd %ymm1, %ymm1, %ymm1
1647 ; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
1648 ; AVX2-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8
1649 ; AVX2-NEXT: vpsubd %ymm8, %ymm0, %ymm9
1650 ; AVX2-NEXT: vpsubd %ymm8, %ymm1, %ymm10
1651 ; AVX2-NEXT: vpsubd %ymm8, %ymm2, %ymm2
1652 ; AVX2-NEXT: vpsubd %ymm8, %ymm3, %ymm3
1653 ; AVX2-NEXT: vpsubd %ymm8, %ymm4, %ymm4
1654 ; AVX2-NEXT: vpsubd %ymm8, %ymm5, %ymm5
1655 ; AVX2-NEXT: vpsubd %ymm8, %ymm6, %ymm1
1656 ; AVX2-NEXT: vpsubd %ymm8, %ymm7, %ymm0
1657 ; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11
1658 ; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12
1659 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
1660 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
1661 ; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6
1662 ; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7
1663 ; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8
1664 ; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3
1665 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1666 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
1667 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3]
1668 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1669 ; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0
1670 ; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8
1671 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
1672 ; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1
1673 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1674 ; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1
1675 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1676 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1677 ; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
1678 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
1679 ; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
1680 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0]
1681 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
1682 ; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1
1683 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1684 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1685 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4
1686 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
1687 ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
1688 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
1689 ; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4
1690 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
1691 ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
1692 ; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2
1693 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1694 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1695 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
1696 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
1697 ; AVX2-NEXT: vmovdqu %ymm1, (%rax)
1698 ; AVX2-NEXT: vmovdqu %ymm0, (%rax)
1699 ; AVX2-NEXT: vzeroupper
1700 ; AVX2-NEXT: retq
1701 ;
17141702 ; AVX512F-LABEL: avg_v64i8_2:
17151703 ; AVX512F: # BB#0:
17161704 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
17171705 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
17181706 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
17191707 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
1720 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4
1721 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm5
1722 ; AVX512F-NEXT: vpaddd %zmm5, %zmm0, %zmm0
1723 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm5
1724 ; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1
1725 ; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm5
1726 ; AVX512F-NEXT: vpaddd %zmm5, %zmm2, %zmm2
1727 ; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm4
1728 ; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
1708 ; AVX512F-NEXT: vpaddd %zmm3, %zmm3, %zmm3
1709 ; AVX512F-NEXT: vpaddd %zmm2, %zmm2, %zmm2
1710 ; AVX512F-NEXT: vpaddd %zmm1, %zmm1, %zmm1
1711 ; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0
1712 ; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
1713 ; AVX512F-NEXT: vpsubd %zmm4, %zmm0, %zmm0
1714 ; AVX512F-NEXT: vpsubd %zmm4, %zmm1, %zmm1
1715 ; AVX512F-NEXT: vpsubd %zmm4, %zmm2, %zmm2
1716 ; AVX512F-NEXT: vpsubd %zmm4, %zmm3, %zmm3
17291717 ; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3
17301718 ; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2
17311719 ; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
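The AVX512F checks above capture the intended codegen change for 512-bit vectors: the splat-1 constant that was previously broadcast from memory (vpbroadcastd) disappears, and the +1 is instead folded into a subtract of an all-ones register materialized with vpternlogd $255. A minimal IR sketch of the kind of pattern being exercised (hypothetical function name, not part of this test file) that should now lower this way on an AVX512F target:

define <16 x i32> @inc_v16i32(<16 x i32> %x) {
  ; add of a splat-1 constant; expected to become vpternlogd (all-ones) + vpsubd
  %r = add <16 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
                           i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <16 x i32> %r
}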
18181806 define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
18191807 ; SSE2-LABEL: avg_v16i16_2:
18201808 ; SSE2: # BB#0:
1821 ; SSE2-NEXT: movdqa (%rdi), %xmm5
1809 ; SSE2-NEXT: movdqa (%rdi), %xmm2
18221810 ; SSE2-NEXT: movdqa 16(%rdi), %xmm4
18231811 ; SSE2-NEXT: movdqa (%rsi), %xmm0
18241812 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
1825 ; SSE2-NEXT: pxor %xmm6, %xmm6
1826 ; SSE2-NEXT: movdqa %xmm5, %xmm7
1827 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
1828 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
1829 ; SSE2-NEXT: movdqa %xmm4, %xmm8
1830 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
1831 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
1813 ; SSE2-NEXT: pxor %xmm5, %xmm5
1814 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1815 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1816 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
1817 ; SSE2-NEXT: movdqa %xmm4, %xmm7
1818 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
1819 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
18321820 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1833 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
1834 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
1821 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
1822 ; SSE2-NEXT: paddd %xmm6, %xmm3
1823 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
1824 ; SSE2-NEXT: paddd %xmm2, %xmm0
18351825 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1836 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
1837 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
1838 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1]
1839 ; SSE2-NEXT: paddd %xmm6, %xmm3
1840 ; SSE2-NEXT: paddd %xmm7, %xmm3
1841 ; SSE2-NEXT: paddd %xmm6, %xmm0
1842 ; SSE2-NEXT: paddd %xmm5, %xmm0
1843 ; SSE2-NEXT: paddd %xmm6, %xmm2
1844 ; SSE2-NEXT: paddd %xmm8, %xmm2
1845 ; SSE2-NEXT: paddd %xmm6, %xmm1
1826 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
1827 ; SSE2-NEXT: paddd %xmm7, %xmm2
1828 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
18461829 ; SSE2-NEXT: paddd %xmm4, %xmm1
1830 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
1831 ; SSE2-NEXT: psubd %xmm4, %xmm3
1832 ; SSE2-NEXT: psubd %xmm4, %xmm0
1833 ; SSE2-NEXT: psubd %xmm4, %xmm2
1834 ; SSE2-NEXT: psubd %xmm4, %xmm1
18471835 ; SSE2-NEXT: psrld $1, %xmm1
18481836 ; SSE2-NEXT: psrld $1, %xmm2
18491837 ; SSE2-NEXT: psrld $1, %xmm0
18671855 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
18681856 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
18691857 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1870 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1858 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
18711859 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1872 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1873 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1874 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1875 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
1876 ; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
18771860 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1878 ; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm4
1861 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
18791862 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
1880 ; AVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm4
1863 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
18811864 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
1882 ; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm3
1883 ; AVX1-NEXT: vpaddd %xmm3, %xmm8, %xmm3
1865 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1866 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
1867 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
1868 ; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
1869 ; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1
1870 ; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
1871 ; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3
18841872 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
18851873 ; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
18861874 ; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
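In the 128-bit SSE2 and AVX1 hunks above, the same rewrite shows up as pcmpeqd/vpcmpeqd producing all-ones followed by psubd/vpsubd, replacing the [1,1,1,1] splat constant load. A reduced IR sketch (hypothetical name, for illustration only) of a single such increment:

define <4 x i32> @inc_v4i32(<4 x i32> %x) {
  ; add X, <1,1,1,1>; with this change it is expected to lower to pcmpeqd + psubd
  %r = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %r
}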
19271915 define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
19281916 ; SSE2-LABEL: avg_v32i16_2:
19291917 ; SSE2: # BB#0:
1930 ; SSE2-NEXT: movdqa (%rdi), %xmm11
1931 ; SSE2-NEXT: movdqa 16(%rdi), %xmm10
1932 ; SSE2-NEXT: movdqa 32(%rdi), %xmm9
1933 ; SSE2-NEXT: movdqa 48(%rdi), %xmm4
1934 ; SSE2-NEXT: movdqa (%rsi), %xmm8
1918 ; SSE2-NEXT: movdqa (%rdi), %xmm4
1919 ; SSE2-NEXT: movdqa 16(%rdi), %xmm11
1920 ; SSE2-NEXT: movdqa 32(%rdi), %xmm10
1921 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8
1922 ; SSE2-NEXT: movdqa (%rsi), %xmm9
19351923 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
19361924 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
19371925 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
19381926 ; SSE2-NEXT: pxor %xmm0, %xmm0
1939 ; SSE2-NEXT: movdqa %xmm11, %xmm15
1940 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
1927 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1928 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
1929 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1930 ; SSE2-NEXT: movdqa %xmm11, %xmm5
1931 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
19411932 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
1942 ; SSE2-NEXT: movdqa %xmm10, %xmm14
1943 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
1933 ; SSE2-NEXT: movdqa %xmm10, %xmm12
1934 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
19441935 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
1945 ; SSE2-NEXT: movdqa %xmm9, %xmm13
1936 ; SSE2-NEXT: movdqa %xmm8, %xmm13
19461937 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
1938 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
1939 ; SSE2-NEXT: movdqa %xmm9, %xmm7
1940 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
1941 ; SSE2-NEXT: paddd %xmm6, %xmm7
19471942 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
1948 ; SSE2-NEXT: movdqa %xmm4, %xmm12
1949 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
1950 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1951 ; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
1952 ; SSE2-NEXT: movdqa %xmm8, %xmm7
1953 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
1954 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
1943 ; SSE2-NEXT: paddd %xmm4, %xmm9
19551944 ; SSE2-NEXT: movdqa %xmm1, %xmm6
19561945 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
1946 ; SSE2-NEXT: paddd %xmm5, %xmm6
19571947 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1948 ; SSE2-NEXT: paddd %xmm11, %xmm1
19581949 ; SSE2-NEXT: movdqa %xmm2, %xmm5
19591950 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
1951 ; SSE2-NEXT: paddd %xmm12, %xmm5
19601952 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1953 ; SSE2-NEXT: paddd %xmm10, %xmm2
19611954 ; SSE2-NEXT: movdqa %xmm3, %xmm4
19621955 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1956 ; SSE2-NEXT: paddd %xmm13, %xmm4
19631957 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1964 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
1965 ; SSE2-NEXT: paddd %xmm0, %xmm7
1966 ; SSE2-NEXT: paddd %xmm15, %xmm7
1967 ; SSE2-NEXT: paddd %xmm0, %xmm8
1968 ; SSE2-NEXT: paddd %xmm11, %xmm8
1969 ; SSE2-NEXT: paddd %xmm0, %xmm6
1970 ; SSE2-NEXT: paddd %xmm14, %xmm6
1971 ; SSE2-NEXT: paddd %xmm0, %xmm1
1972 ; SSE2-NEXT: paddd %xmm10, %xmm1
1973 ; SSE2-NEXT: paddd %xmm0, %xmm5
1974 ; SSE2-NEXT: paddd %xmm13, %xmm5
1975 ; SSE2-NEXT: paddd %xmm0, %xmm2
1976 ; SSE2-NEXT: paddd %xmm9, %xmm2
1977 ; SSE2-NEXT: paddd %xmm0, %xmm4
1978 ; SSE2-NEXT: paddd %xmm12, %xmm4
1979 ; SSE2-NEXT: paddd %xmm0, %xmm3
1980 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
1981 ; SSE2-NEXT: psrld $1, %xmm8
1958 ; SSE2-NEXT: paddd %xmm8, %xmm3
1959 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
1960 ; SSE2-NEXT: psubd %xmm0, %xmm7
1961 ; SSE2-NEXT: psubd %xmm0, %xmm9
1962 ; SSE2-NEXT: psubd %xmm0, %xmm6
1963 ; SSE2-NEXT: psubd %xmm0, %xmm1
1964 ; SSE2-NEXT: psubd %xmm0, %xmm5
1965 ; SSE2-NEXT: psubd %xmm0, %xmm2
1966 ; SSE2-NEXT: psubd %xmm0, %xmm4
1967 ; SSE2-NEXT: psubd %xmm0, %xmm3
1968 ; SSE2-NEXT: psrld $1, %xmm3
1969 ; SSE2-NEXT: psrld $1, %xmm4
1970 ; SSE2-NEXT: psrld $1, %xmm2
1971 ; SSE2-NEXT: psrld $1, %xmm5
1972 ; SSE2-NEXT: psrld $1, %xmm1
1973 ; SSE2-NEXT: psrld $1, %xmm6
1974 ; SSE2-NEXT: psrld $1, %xmm9
19821975 ; SSE2-NEXT: psrld $1, %xmm7
19831976 ; SSE2-NEXT: pslld $16, %xmm7
19841977 ; SSE2-NEXT: psrad $16, %xmm7
1985 ; SSE2-NEXT: pslld $16, %xmm8
1986 ; SSE2-NEXT: psrad $16, %xmm8
1987 ; SSE2-NEXT: packssdw %xmm7, %xmm8
1988 ; SSE2-NEXT: psrld $1, %xmm1
1989 ; SSE2-NEXT: psrld $1, %xmm6
1978 ; SSE2-NEXT: pslld $16, %xmm9
1979 ; SSE2-NEXT: psrad $16, %xmm9
1980 ; SSE2-NEXT: packssdw %xmm7, %xmm9
19901981 ; SSE2-NEXT: pslld $16, %xmm6
19911982 ; SSE2-NEXT: psrad $16, %xmm6
19921983 ; SSE2-NEXT: pslld $16, %xmm1
19931984 ; SSE2-NEXT: psrad $16, %xmm1
19941985 ; SSE2-NEXT: packssdw %xmm6, %xmm1
1995 ; SSE2-NEXT: psrld $1, %xmm2
1996 ; SSE2-NEXT: psrld $1, %xmm5
19971986 ; SSE2-NEXT: pslld $16, %xmm5
19981987 ; SSE2-NEXT: psrad $16, %xmm5
19991988 ; SSE2-NEXT: pslld $16, %xmm2
20001989 ; SSE2-NEXT: psrad $16, %xmm2
20011990 ; SSE2-NEXT: packssdw %xmm5, %xmm2
2002