llvm.org GIT mirror: llvm / 13da61c
[CostModel][X86] Add CTPOP scalar costs (PR43656)

Add specific scalar costs for ctpop instructions; these are based on llvm-mca's SLM throughput numbers (the oldest model we have). For targets supporting POPCNT, we provide overrides that assume 1cy costs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@374775 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Simon Pilgrim
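For context (not part of the commit): without POPCNT, @llvm.ctpop is expanded into shift-and-mask arithmetic. The sketch below is the standard Hamming-weight bit trick, shown here only to illustrate the kind of roughly-a-dozen-ALU-op sequence whose SLM throughput the new i64 cost of 10 approximates; with POPCNT the same operation is a single popcnt instruction, hence the 1cy overrides.

#include <cstdint>

// Classic shift/mask population count -- similar in shape to the
// expansion a target without POPCNT uses for @llvm.ctpop.i64, and the
// kind of sequence the new scalar cost of 10 is modelling.
static uint64_t popcount64(uint64_t X) {
  X -= (X >> 1) & 0x5555555555555555ULL;                                // 2-bit sums
  X = (X & 0x3333333333333333ULL) + ((X >> 2) & 0x3333333333333333ULL); // 4-bit sums
  X = (X + (X >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                           // 8-bit sums
  return (X * 0x0101010101010101ULL) >> 56;                             // add all bytes
}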
3 changed files with 73 additions and 26 deletions.
lib/Target/X86/X86TargetTransformInfo.cpp:

     { ISD::FSQRT, MVT::f32,   28 }, // Pentium III from http://www.agner.org/
     { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
   };
+  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
+    { ISD::CTPOP, MVT::i64, 1 },
+  };
+  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
+    { ISD::CTPOP, MVT::i32, 1 },
+    { ISD::CTPOP, MVT::i16, 1 },
+    { ISD::CTPOP, MVT::i8,  1 },
+  };
   static const CostTblEntry X64CostTbl[] = { // 64-bit targets
     { ISD::BITREVERSE, MVT::i64, 14 },
+    { ISD::CTPOP, MVT::i64, 10 },
     { ISD::SADDO, MVT::i64, 1 },
     { ISD::UADDO, MVT::i64, 1 },
   };
   static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
     { ISD::BITREVERSE, MVT::i32, 14 },
     { ISD::BITREVERSE, MVT::i16, 14 },
     { ISD::BITREVERSE, MVT::i8,  11 },
+    { ISD::CTPOP, MVT::i32, 8 },
+    { ISD::CTPOP, MVT::i16, 9 },
+    { ISD::CTPOP, MVT::i8,  7 },
     { ISD::SADDO, MVT::i32, 1 },
     { ISD::SADDO, MVT::i16, 1 },
     { ISD::SADDO, MVT::i8,  1 },
     if (ST->hasSSE1())
       if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
         return LT.first * Entry->Cost;
+
+    if (ST->hasPOPCNT()) {
+      if (ST->is64Bit())
+        if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
+          return LT.first * Entry->Cost;
+
+      if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
+        return LT.first * Entry->Cost;
+    }
+
+    // TODO - add LZCNT and BMI (TZCNT) scalar handling

     if (ST->is64Bit())
       if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
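Note the ordering: the POPCNT tables are consulted before the generic X64/X86 tables, so on POPCNT-capable targets the 1cy entries shadow the 10/8/9/7 fallbacks. A minimal self-contained sketch of that first-match lookup, using simplified stand-ins for LLVM's CostTblEntry and CostTableLookup (the real versions key on an ISD opcode and an MVT):

// Simplified stand-ins for LLVM's cost-table machinery; illustrative only.
struct CostTblEntry { int ISD; int Type; unsigned Cost; };

template <unsigned N>
static const CostTblEntry *CostTableLookup(const CostTblEntry (&Tbl)[N],
                                           int ISD, int Ty) {
  for (const CostTblEntry &E : Tbl)
    if (E.ISD == ISD && E.Type == Ty)
      return &E;     // first matching entry wins
  return nullptr;    // caller falls through to the next table
}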
test/Analysis/CostModel/X86/ctpop.ll:

 define i64 @var_ctpop_i64(i64 %a) {
 ; NOPOPCNT-LABEL: 'var_ctpop_i64'
-; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
+; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
 ; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctpop
 ;
 ; POPCNT-LABEL: 'var_ctpop_i64'

 define i32 @var_ctpop_i32(i32 %a) {
 ; NOPOPCNT-LABEL: 'var_ctpop_i32'
-; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i32 @llvm.ctpop.i32(i32 %a)
+; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ctpop = call i32 @llvm.ctpop.i32(i32 %a)
 ; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctpop
 ;
 ; POPCNT-LABEL: 'var_ctpop_i32'

 define i16 @var_ctpop_i16(i16 %a) {
 ; NOPOPCNT-LABEL: 'var_ctpop_i16'
-; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i16 @llvm.ctpop.i16(i16 %a)
+; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call i16 @llvm.ctpop.i16(i16 %a)
 ; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctpop
 ;
 ; POPCNT-LABEL: 'var_ctpop_i16'

 define i8 @var_ctpop_i8(i8 %a) {
 ; NOPOPCNT-LABEL: 'var_ctpop_i8'
-; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a)
+; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a)
 ; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctpop
 ;
 ; POPCNT-LABEL: 'var_ctpop_i8'
test/Transforms/SLPVectorizer/X86/ctpop.ll:

 declare i8 @llvm.ctpop.i8(i8)

 define void @ctpop_2i64() #0 {
-; CHECK-LABEL: @ctpop_2i64(
-; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
-; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
-; CHECK-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
-; CHECK-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
-; CHECK-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
-; CHECK-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
-; CHECK-NEXT: ret void
+; SSE2-LABEL: @ctpop_2i64(
+; SSE2-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
+; SSE2-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]])
+; SSE2-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
+; SSE2-NEXT: ret void
+;
+; SSE42-LABEL: @ctpop_2i64(
+; SSE42-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+; SSE42-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+; SSE42-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
+; SSE42-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
+; SSE42-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+; SSE42-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+; SSE42-NEXT: ret void
+;
+; AVX-LABEL: @ctpop_2i64(
+; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+; AVX-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
+; AVX-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
+; AVX-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX-NEXT: ret void
 ;
 %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
 %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
 }
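The ctpop_2i64 update above is a direct consequence of the new scalar costs: in the SSE2 configuration (no POPCNT) two scalar ctpops now cost 2 x 10 and the <2 x i64> form becomes profitable, while the SSE42 and AVX configurations keep the scalars, since cheap 1cy POPCNT makes vectorization a loss. Ignoring the memory-op and packing costs that SLP also accounts for, the decision reduces to the comparison sketched below (shouldVectorize is a hypothetical helper for illustration, not the SLP API):

// Hedged sketch of the SLP profitability test, assuming ScalarCost and
// VecCost come from tables like those added above; the real cost model
// also charges for gathers, extracts, and vector loads/stores.
static bool shouldVectorize(unsigned ScalarCost, unsigned VecCost,
                            unsigned NumLanes) {
  return VecCost < ScalarCost * NumLanes;
}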

 define void @ctpop_4i64() #0 {
-; SSE-LABEL: @ctpop_4i64(
-; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; SSE-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
-; SSE-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
-; SSE-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]])
-; SSE-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]])
-; SSE-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; SSE-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; SSE-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; SSE-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
-; SSE-NEXT: ret void
+; SSE2-LABEL: @ctpop_4i64(
+; SSE2-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4
+; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4
+; SSE2-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]])
+; SSE2-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP2]])
+; SSE2-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4
+; SSE2-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4
+; SSE2-NEXT: ret void
+;
+; SSE42-LABEL: @ctpop_4i64(
+; SSE42-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; SSE42-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; SSE42-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; SSE42-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; SSE42-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
+; SSE42-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
+; SSE42-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]])
+; SSE42-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]])
+; SSE42-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; SSE42-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; SSE42-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; SSE42-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; SSE42-NEXT: ret void
 ;
 ; AVX1-LABEL: @ctpop_4i64(
 ; AVX1-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4