llvm.org GIT mirror: llvm @ 7cb9be5
[CostModel][X86] Add CTLZ scalar costs

Add specific scalar costs for CTLZ instructions. We can't discriminate between CTLZ and CTLZ_ZERO_UNDEF, so we have to assume the worst. Given how BSR is often a microcoded nightmare on some older targets, we might still be underestimating it. For targets supporting LZCNT (Intel Haswell+ or AMD Fam10+), we provide overrides that assume 1cy costs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@374786 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Simon Pilgrim
3 changed files with 250 additions and 178 deletions.
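For context before the diff: a minimal standalone C++ sketch (illustrative only, not LLVM code; all names here are made up) of the two scalar sequences these cost entries model. Without LZCNT, a defined-at-zero i32 ctlz lowers to roughly BSR + XOR, plus a CMOV-style select for the zero input, which is what the cost of 4 approximates; with LZCNT it is a single instruction, hence the 1cy override.

    #include <cstdint>

    // bsr32 mimics the x86 BSR instruction: it yields the bit index of the
    // most significant set bit and, like the hardware, should be treated as
    // undefined for X == 0.
    static unsigned bsr32(uint32_t X) {
      unsigned Index = 0;
      while (X >>= 1)
        ++Index;
      return Index;
    }

    // CTLZ (defined at zero), as opposed to CTLZ_ZERO_UNDEF:
    // XOR 31 converts a bit index into a leading-zero count, and a
    // CMOV-style select gives the zero input a defined result.
    unsigned ctlz32(uint32_t X) {
      unsigned FromBSR = bsr32(X) ^ 31;
      return X ? FromBSR : 32;
    }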
lib/Target/X86/X86TargetTransformInfo.cpp:

     { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
     { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
   };
+  static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
+    { ISD::CTLZ, MVT::i64, 1 },
+  };
+  static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
+    { ISD::CTLZ, MVT::i32, 1 },
+    { ISD::CTLZ, MVT::i16, 1 },
+    { ISD::CTLZ, MVT::i8, 1 },
+  };
   static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
     { ISD::CTPOP, MVT::i64, 1 },
   };
...
   };
   static const CostTblEntry X64CostTbl[] = { // 64-bit targets
     { ISD::BITREVERSE, MVT::i64, 14 },
+    { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTPOP, MVT::i64, 10 },
     { ISD::SADDO, MVT::i64, 1 },
     { ISD::UADDO, MVT::i64, 1 },
...
     { ISD::BITREVERSE, MVT::i32, 14 },
     { ISD::BITREVERSE, MVT::i16, 14 },
     { ISD::BITREVERSE, MVT::i8, 11 },
+    { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
+    { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
     { ISD::CTPOP, MVT::i32, 8 },
     { ISD::CTPOP, MVT::i16, 9 },
     { ISD::CTPOP, MVT::i8, 7 },
...
   if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
     return LT.first * Entry->Cost;

+  if (ST->hasLZCNT()) {
+    if (ST->is64Bit())
+      if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
+
+    if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+  }
+
   if (ST->hasPOPCNT()) {
     if (ST->is64Bit())
       if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
...
         return LT.first * Entry->Cost;
   }

-  // TODO - add LZCNT and BMI (TZCNT) scalar handling
+  // TODO - add BMI (TZCNT) scalar handling

   if (ST->is64Bit())
     if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
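Note the ordering above: the LZCNT tables are consulted before the generic X64/X86 tables, so the 1cy override wins whenever the subtarget reports hasLZCNT(). As a rough standalone sketch of the lookup pattern (the real CostTableLookup lives in llvm/include/llvm/CodeGen/CostTable.h and operates on ArrayRef; this simplified version is illustrative only):

    #include <cstddef>

    // Simplified stand-ins for the LLVM types involved.
    struct CostTblEntry {
      int ISD;       // ISD opcode, e.g. ISD::CTLZ
      int Type;      // simple value type, e.g. MVT::i32
      unsigned Cost;
    };

    // Linear scan for the first entry matching both opcode and type.
    // Returning nullptr lets the caller fall through to the next table.
    template <size_t N>
    const CostTblEntry *lookupCost(const CostTblEntry (&Tbl)[N],
                                   int ISD, int Ty) {
      for (const CostTblEntry &Entry : Tbl)
        if (Entry.ISD == ISD && Entry.Type == Ty)
          return &Entry;
      return nullptr;
    }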
test/Analysis/CostModel/X86/ctlz.ll:

 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512BW
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512vl,+avx512bw,+avx512dq,+avx512cd | FileCheck %s -check-prefixes=CHECK,AVX512CD
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=-lzcnt,+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2,NOLZCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+lzcnt,+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2,LZCNT
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+lzcnt,+sse4.2 | FileCheck %s -check-prefixes=CHECK,LZCNT,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+lzcnt,+avx | FileCheck %s -check-prefixes=CHECK,LZCNT,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+lzcnt,+avx2 | FileCheck %s -check-prefixes=CHECK,LZCNT,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+lzcnt,+avx512f | FileCheck %s -check-prefixes=CHECK,LZCNT,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+lzcnt,+avx512vl,+avx512bw,+avx512dq | FileCheck %s -check-prefixes=CHECK,LZCNT,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+lzcnt,+avx512vl,+avx512bw,+avx512dq,+avx512cd | FileCheck %s -check-prefixes=CHECK,LZCNT,AVX512CD

 ; Verify the cost of scalar leading zero count instructions.

...
 declare i8 @llvm.ctlz.i8(i8, i1)

 define i64 @var_ctlz_i64(i64 %a) {
-; CHECK-LABEL: 'var_ctlz_i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
+; NOLZCNT-LABEL: 'var_ctlz_i64'
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
+;
+; LZCNT-LABEL: 'var_ctlz_i64'
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
 ;
   %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 0)
   ret i64 %ctlz
 }

 define i64 @var_ctlz_i64u(i64 %a) {
-; CHECK-LABEL: 'var_ctlz_i64u'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
+; NOLZCNT-LABEL: 'var_ctlz_i64u'
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 true)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
+;
+; LZCNT-LABEL: 'var_ctlz_i64u'
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 true)
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
 ;
   %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 1)
   ret i64 %ctlz
 }

 define i32 @var_ctlz_i32(i32 %a) {
-; CHECK-LABEL: 'var_ctlz_i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
+; NOLZCNT-LABEL: 'var_ctlz_i32'
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
+;
+; LZCNT-LABEL: 'var_ctlz_i32'
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
 ;
   %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 0)
   ret i32 %ctlz
 }

 define i32 @var_ctlz_i32u(i32 %a) {
-; CHECK-LABEL: 'var_ctlz_i32u'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
+; NOLZCNT-LABEL: 'var_ctlz_i32u'
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
+;
+; LZCNT-LABEL: 'var_ctlz_i32u'
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
 ;
   %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 1)
   ret i32 %ctlz
 }

 define i16 @var_ctlz_i16(i16 %a) {
-; CHECK-LABEL: 'var_ctlz_i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
+; NOLZCNT-LABEL: 'var_ctlz_i16'
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
+;
+; LZCNT-LABEL: 'var_ctlz_i16'
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
 ;
   %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 0)
   ret i16 %ctlz
 }

 define i16 @var_ctlz_i16u(i16 %a) {
-; CHECK-LABEL: 'var_ctlz_i16u'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
+; NOLZCNT-LABEL: 'var_ctlz_i16u'
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 true)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
+;
+; LZCNT-LABEL: 'var_ctlz_i16u'
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 true)
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
 ;
   %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 1)
   ret i16 %ctlz
 }

 define i8 @var_ctlz_i8(i8 %a) {
-; CHECK-LABEL: 'var_ctlz_i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
+; NOLZCNT-LABEL: 'var_ctlz_i8'
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
+;
+; LZCNT-LABEL: 'var_ctlz_i8'
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
 ;
   %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 0)
   ret i8 %ctlz
 }

 define i8 @var_ctlz_i8u(i8 %a) {
-; CHECK-LABEL: 'var_ctlz_i8u'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
+; NOLZCNT-LABEL: 'var_ctlz_i8u'
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 true)
+; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
+;
+; LZCNT-LABEL: 'var_ctlz_i8u'
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 true)
+; LZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
 ;
   %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 1)
   ret i8 %ctlz
 }
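The paired tests above exercise both flavors of the intrinsic: i1 false (the result is defined for a zero input) and i1 true (a zero input is undefined). As the commit message notes, the cost tables cannot discriminate between CTLZ and CTLZ_ZERO_UNDEF, so both flavors receive the same worst-case cost. A standalone C++ sketch of the semantic difference (illustrative only, not LLVM code):

    #include <cstdint>
    #include <optional>

    // Leading-zero count written out directly; returns 32 for X == 0.
    static unsigned clz32(uint32_t X) {
      unsigned N = 0;
      for (uint32_t Bit = 0x80000000u; Bit && !(X & Bit); Bit >>= 1)
        ++N;
      return N;
    }

    // llvm.ctlz(x, /*is_zero_undef=*/false): total, ctlz(0) == 32.
    unsigned ctlz_defined(uint32_t X) { return clz32(X); }

    // llvm.ctlz(x, /*is_zero_undef=*/true): the zero input is undefined,
    // modeled here as an empty optional.
    std::optional<unsigned> ctlz_zero_undef(uint32_t X) {
      if (X == 0)
        return std::nullopt;
      return clz32(X);
    }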
test/Transforms/SLPVectorizer/X86/ctlz.ll:

 }

 define void @ctlz_4i32() #0 {
-; CHECK-LABEL: @ctlz_4i32(
-; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; CHECK-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; CHECK-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; CHECK-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; CHECK-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; CHECK-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; CHECK-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; CHECK-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; CHECK-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; CHECK-NEXT: ret void
+; SSE2-LABEL: @ctlz_4i32(
+; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE2-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: ret void
+;
+; SSE42-LABEL: @ctlz_4i32(
+; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE42-NEXT: ret void
+;
+; AVX1-LABEL: @ctlz_4i32(
+; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; AVX1-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ctlz_4i32(
+; AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; AVX2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; AVX2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; AVX2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; AVX2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; AVX2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; AVX2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; AVX2-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; AVX2-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; AVX2-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; AVX2-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX2-NEXT: ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
...
 }

 define void @ctlz_8i32() #0 {
-; SSE-LABEL: @ctlz_8i32(
-; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; SSE-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
-; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
-; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
-; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
-; SSE-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; SSE-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; SSE-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; SSE-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; SSE-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; SSE-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; SSE-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; SSE-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
-; SSE-NEXT: ret void
-;
-; AVX1-LABEL: @ctlz_8i32(
-; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
-; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
-; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
-; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
-; AVX1-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
-; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
-; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
-; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
-; AVX1-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; AVX1-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; AVX1-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; AVX1-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; AVX1-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; AVX1-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; AVX1-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; AVX1-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
-; AVX1-NEXT: ret void
-;
-; AVX2-LABEL: @ctlz_8i32(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
-; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false)
-; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
-; AVX2-NEXT: ret void
+; SSE2-LABEL: @ctlz_8i32(
+; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; SSE2-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; SSE2-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; SSE2-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; SSE2-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; SSE2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE2-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
+; SSE2-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
+; SSE2-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
+; SSE2-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
+; SSE2-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; SSE2-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; SSE2-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; SSE2-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; SSE2-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; SSE2-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; SSE2-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; SSE2-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; SSE2-NEXT: ret void
+;
+; SSE42-LABEL: @ctlz_8i32(
+; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
+; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP2]], i1 false)
+; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE42-NEXT: ret void
+;
+; AVX-LABEL: @ctlz_8i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; AVX-NEXT: ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
...
 }

 define void @ctlz_undef_4i32() #0 {
-; CHECK-LABEL: @ctlz_undef_4i32(
-; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; CHECK-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; CHECK-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; CHECK-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
-; CHECK-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
-; CHECK-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
-; CHECK-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
-; CHECK-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; CHECK-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; CHECK-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; CHECK-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; CHECK-NEXT: ret void
+; SSE2-LABEL: @ctlz_undef_4i32(
+; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
+; SSE2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
+; SSE2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
+; SSE2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
+; SSE2-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: ret void
+;
+; SSE42-LABEL: @ctlz_undef_4i32(
+; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE42-NEXT: ret void
+;
+; AVX1-LABEL: @ctlz_undef_4i32(
+; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; AVX1-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ctlz_undef_4i32(
+; AVX2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; AVX2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; AVX2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; AVX2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; AVX2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
+; AVX2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
+; AVX2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
+; AVX2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
+; AVX2-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; AVX2-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; AVX2-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; AVX2-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX2-NEXT: ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
...
 }

 define void @ctlz_undef_8i32() #0 {
-; SSE-LABEL: @ctlz_undef_8i32(
-; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
-; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
-; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
-; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
-; SSE-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 true)
-; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true)
-; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true)
-; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true)
-; SSE-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; SSE-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; SSE-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; SSE-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; SSE-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; SSE-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; SSE-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; SSE-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
-; SSE-NEXT: ret void
-;
-; AVX1-LABEL: @ctlz_undef_8i32(
-; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
-; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
-; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
-; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
-; AVX1-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 true)
-; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true)
-; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true)
-; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true)
-; AVX1-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; AVX1-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; AVX1-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; AVX1-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; AVX1-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; AVX1-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; AVX1-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; AVX1-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
-; AVX1-NEXT: ret void
-;
-; AVX2-LABEL: @ctlz_undef_8i32(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
-; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 true)
-; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
-; AVX2-NEXT: ret void
+; SSE2-LABEL: @ctlz_undef_8i32(
+; SSE2-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; SSE2-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; SSE2-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; SSE2-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; SSE2-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; SSE2-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; SSE2-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; SSE2-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; SSE2-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
+; SSE2-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
+; SSE2-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
+; SSE2-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
+; SSE2-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 true)
+; SSE2-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true)
+; SSE2-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true)
+; SSE2-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true)
+; SSE2-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; SSE2-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; SSE2-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; SSE2-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; SSE2-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; SSE2-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; SSE2-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; SSE2-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; SSE2-NEXT: ret void
+;
+; SSE42-LABEL: @ctlz_undef_8i32(
+; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
+; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP2]], i1 true)
+; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE42-NEXT: ret void
+;
+; AVX-LABEL: @ctlz_undef_8i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 true)
+; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; AVX-NEXT: ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
...
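The SLPVectorizer test churn above is a downstream effect of the new scalar costs: with scalar i32 ctlz now charged 4 rather than a generic default, the scalar-versus-vector comparison flips for some bundles, so previously-scalar cases such as ctlz_4i32 now vectorize on SSE4.2 and AVX1 while SSE2 and AVX2 keep the scalar calls. A minimal C++ sketch of that trade-off, under the simplifying assumption that SLP just compares summed scalar cost against vector cost (the real heuristic also accounts for gathers, extracts, and users):

    // Illustrative only: SLP (roughly) vectorizes a bundle when the vector
    // op is cheaper than the VF scalar ops it replaces.
    bool profitableToVectorize(unsigned ScalarCost, unsigned VectorCost,
                               unsigned VF) {
      return VectorCost < ScalarCost * VF;
    }

    // With this patch, a <4 x i32> ctlz bundle replaces four scalar ops of
    // cost 4 each (no LZCNT), so vectorization wins only where the vector
    // expansion is modeled as costing less than 16.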