[LV] fold-tail predication should be respected even with assume_safety

assume_safety implies that loads under "if's" can be safely executed
speculatively (unguarded, unmasked). However, this assumption holds only for
the original user "if's", not for those introduced by the compiler, such as
the fold-tail "if" that guards us from loading beyond the original loop
trip-count. Currently the combination of fold-tail and assume-safety pragmas
results in ignoring the fold-tail predicate that guards the loads, generating
unmasked loads. This patch fixes that behavior.

Reviewers: Ayal, hsaito, fhahn

Differential Revision: https://reviews.llvm.org/D66106

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@368973 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Dorit Nuzman

5 changed files with 194 additions and 21 deletions.
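For reference, the source-level pattern in question (taken from Case3 of the test added below) looks like the following C loop. assume_safety lets the vectorizer drop the mask that the user's "if (ix > guard)" would otherwise require from the loads, while vectorize_predicate forces fold-tail, which introduces a guard of its own that must keep masking them:

// Mirrors Case3 of the new test: both pragmas combined. The user's
// "if (ix > guard)" may be dropped from the loads under assume_safety,
// but the fold-tail predicate (ix < 1021, per vector lane) may not be.
void fold_tail_and_assume_safety(int *p, int *q1, int *q2, int guard) {
#pragma clang loop vectorize(assume_safety) vectorize_predicate(enable)
  for (int ix = 0; ix < 1021; ++ix) {
    if (ix > guard)
      p[ix] = q1[ix] + q2[ix];
  }
}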
@@ -227,8 +227,8 @@
   bool canVectorize(bool UseVPlanNativePath);

   /// Return true if we can vectorize this loop while folding its tail by
-  /// masking.
-  bool canFoldTailByMasking();
+  /// masking, and mark all respective loads/stores for masking.
+  bool prepareToFoldTailByMasking();

   /// Returns the primary induction variable.
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
@@ -354,9 +354,16 @@
   bool canVectorizeOuterLoop();

   /// Return true if all of the instructions in the block can be speculatively
-  /// executed. \p SafePtrs is a list of addresses that are known to be legal
-  /// and we know that we can read from them without segfault.
-  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
+  /// executed, and record the loads/stores that require masking. If's that
+  /// guard loads can be ignored under "assume safety" unless \p PreserveGuards
+  /// is true. This can happen when we introduce guards for which the original
+  /// "unguarded-loads are safe" assumption does not hold. For example, the
+  /// vectorizer's fold-tail transformation changes the loop to execute beyond
+  /// its original trip-count, under a proper guard, which should be preserved.
+  /// \p SafePtrs is a list of addresses that are known to be legal and we know
+  /// that we can read from them without segfault.
+  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
+                            bool PreserveGuards = false);

   /// Updates the vectorization state by adding \p Phi to the inductions list.
   /// This can set \p Phi as the main induction of the loop if \p Phi is a
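As a minimal illustration of the new parameter (plain C, not the LLVM API; the function name below is made up for this sketch), the decision the .cpp hunk below makes for a load whose pointer is not known-safe reduces to:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of the condition in blockCanBePredicated: a load whose
 * pointer is not in SafePtrs is recorded for masking unless the loop is
 * annotated parallel (assume_safety); PreserveGuards overrides that
 * exemption when the guard was introduced by the compiler (fold-tail). */
static bool loadNeedsMask(bool isAnnotatedParallel, bool preserveGuards) {
  return !isAnnotatedParallel || preserveGuards;
}

int main(void) {
  printf("assume_safety only:        %d\n", loadNeedsMask(true, false)); /* 0: unmasked load */
  printf("assume_safety + fold-tail: %d\n", loadNeedsMask(true, true));  /* 1: masked load   */
  return 0;
}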
@@ -868,7 +868,7 @@
 }

 bool LoopVectorizationLegality::blockCanBePredicated(
-    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
+    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) {
   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

   for (Instruction &I : *BB) {
@@ -887,7 +887,7 @@
         // !llvm.mem.parallel_loop_access implies if-conversion safety.
         // Otherwise, record that the load needs (real or emulated) masking
         // and let the cost model decide.
-        if (!IsAnnotatedParallel)
+        if (!IsAnnotatedParallel || PreserveGuards)
           MaskedOp.insert(LI);
         continue;
       }
@@ -1158,7 +1158,7 @@
   return Result;
 }

-bool LoopVectorizationLegality::canFoldTailByMasking() {
+bool LoopVectorizationLegality::prepareToFoldTailByMasking() {

   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");

@@ -1201,7 +1201,7 @@
   // Check and mark all blocks for predication, including those that ordinarily
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (!blockCanBePredicated(BB, SafePointers)) {
+    if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) {
       reportVectorizationFailure(
           "Cannot fold tail by masking as required",
           "control flow cannot be substituted for a select",
@@ -4852,7 +4852,7 @@
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  if (Legal->canFoldTailByMasking()) {
+  if (Legal->prepareToFoldTailByMasking()) {
     FoldTailByMasking = true;
     return MaxVF;
   }
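To make the trip-count arithmetic behind that FIXME concrete, using only the values from the new test below (trip count 1021, -force-vector-width=8): fold-tail rounds the vector loop up to 128 iterations covering 1024 lanes, so the last iteration spans indices 1016..1023 and lanes 1021..1023 must stay masked off; an unmasked "speculative" load there reads past the original range. A small standalone C sketch of that lane predicate:

#include <stdio.h>

#define TC 1021 /* original trip count, as in the test below  */
#define VF 8    /* forced vector width from the test RUN line */

int main(void) {
  /* Fold-tail executes ceil(TC/VF) vector iterations; each lane is
   * predicated on "lane index < TC", the guard an unmasked load ignores. */
  long vecIters = (TC + VF - 1) / VF;  /* 128  */
  long lastBase = (vecIters - 1) * VF; /* 1016 */
  printf("vector iterations: %ld\n", vecIters);
  for (long ix = lastBase; ix < vecIters * VF; ++ix)
    printf("lane %ld: %s\n", ix, ix < TC ? "active" : "masked off");
  return 0;
}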
; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Case1: With pragma predicate to force tail-folding.
; All memory operations are masked.
;void fold_tail(int * restrict p, int * restrict q1, int * restrict q2, int guard) {
; #pragma clang loop vectorize_predicate(enable)
;   for(int ix=0; ix < 1021; ++ix) {
;     if (ix > guard) {
;       p[ix] = q1[ix] + q2[ix];
;     }
;   }
;}

;CHECK-LABEL: @fold_tail
;CHECK: vector.body:
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call void @llvm.masked.store

; Function Attrs: nofree norecurse nounwind uwtable
define dso_local void @fold_tail(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2,
  i32 %guard) local_unnamed_addr #0 {
entry:
  %0 = sext i32 %guard to i64
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %cmp1 = icmp sgt i64 %indvars.iv, %0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx, align 4, !tbaa !2
  %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv
  %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2
  %add = add nsw i32 %2, %1
  %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
  store i32 %add, i32* %arrayidx5, align 4, !tbaa !2
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1021
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8
}

; Case2: With pragma assume_safety only the store is masked.
; void assume_safety(int * p, int * q1, int * q2, int guard) {
;  #pragma clang loop vectorize(assume_safety)
;   for(int ix=0; ix < 1021; ++ix) {
;     if (ix > guard) {
;       p[ix] = q1[ix] + q2[ix];
;     }
;   }
;}

;CHECK-LABEL: @assume_safety
;CHECK: vector.body:
;CHECK-NOT: @llvm.masked.load
;CHECK: call void @llvm.masked.store

; Function Attrs: norecurse nounwind uwtable
define void @assume_safety(i32* nocapture, i32* nocapture readonly, i32* nocapture readonly, i32) local_unnamed_addr #0 {
  %5 = sext i32 %3 to i64
  br label %7

; <label>:6:
  ret void

; <label>:7:
  %8 = phi i64 [ 0, %4 ], [ %18, %17 ]
  %9 = icmp sgt i64 %8, %5
  br i1 %9, label %10, label %17

; <label>:10:
  %11 = getelementptr inbounds i32, i32* %1, i64 %8
  %12 = load i32, i32* %11, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
  %13 = getelementptr inbounds i32, i32* %2, i64 %8
  %14 = load i32, i32* %13, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
  %15 = add nsw i32 %14, %12
  %16 = getelementptr inbounds i32, i32* %0, i64 %8
  store i32 %15, i32* %16, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
  br label %17

; <label>:17:
  %18 = add nuw nsw i64 %8, 1
  %19 = icmp eq i64 %18, 1021
  br i1 %19, label %6, label %7, !llvm.loop !6
}

; Case3: With pragma assume_safety and pragma predicate both the store and the
; load are masked.
; void fold_tail_and_assume_safety(int * p, int * q1, int * q2, int guard) {
;  #pragma clang loop vectorize(assume_safety) vectorize_predicate(enable)
;   for(int ix=0; ix < 1021; ++ix) {
;     if (ix > guard) {
;       p[ix] = q1[ix] + q2[ix];
;     }
;   }
;}

;CHECK-LABEL: @fold_tail_and_assume_safety
;CHECK: vector.body:
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call void @llvm.masked.store

; Function Attrs: nofree norecurse nounwind uwtable
define dso_local void @fold_tail_and_assume_safety(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2,
  i32 %guard) local_unnamed_addr #0 {
entry:
  %0 = sext i32 %guard to i64
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %cmp1 = icmp sgt i64 %indvars.iv, %0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx, align 4, !tbaa !2, !llvm.access.group !10
  %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv
  %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2, !llvm.access.group !10
  %add = add nsw i32 %2, %1
  %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
  store i32 %add, i32* %arrayidx5, align 4, !tbaa !2, !llvm.access.group !10
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1021
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !11
}

attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 6.0.0-1ubuntu2 (tags/RELEASE_600/final)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.vectorize.enable", i1 true}

!8 = distinct !{!8, !9}
!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}

!10 = distinct !{}
!11 = distinct !{!11, !12, !13}
!12 = !{!"llvm.loop.parallel_accesses", !10}
!13 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
@@ -101,17 +101,17 @@
 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]],
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <8 x i64> [[INDUCTION]],
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP7]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP8]])
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]],
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6
+; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
+; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]])
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7