llvm.org GIT mirror llvm / 1d63d1c
Merging r279930: ------------------------------------------------------------------------ r279930 | elena.demikhovsky | 2016-08-28 01:53:53 -0700 (Sun, 28 Aug 2016) | 7 lines [Loop Vectorizer] Fixed memory confilict checks. Fixed a bug in run-time checks for possible memory conflicts inside loop. The bug is in Low <-> High boundaries calculation. The High boundary should be calculated as "last memory access pointer + element size". Differential revision: https://reviews.llvm.org/D23176 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@287779 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 2 years ago
8 changed file(s) with 109 addition(s) and 30 deletion(s). Raw diff Collapse all Expand all
333333 struct PointerInfo {
334334 /// Holds the pointer value that we need to check.
335335 TrackingVH PointerValue;
336 /// Holds the pointer value at the beginning of the loop.
336 /// Holds the smallest byte address accessed by the pointer throughout all
337 /// iterations of the loop.
337338 const SCEV *Start;
338 /// Holds the pointer value at the end of the loop.
339 /// Holds the largest byte address accessed by the pointer throughout all
340 /// iterations of the loop, plus 1.
339341 const SCEV *End;
340342 /// Holds the information if this pointer is used for writing to memory.
341343 bool IsWritePtr;
147147 return OrigSCEV;
148148 }
149149
150 /// Calculate Start and End points of memory access.
151 /// Let's assume A is the first access and B is the memory access on the N-th
152 /// loop iteration. Then B is calculated as:
153 /// B = A + Step * N.
154 /// The Step value may be positive or negative.
155 /// N is the calculated back-edge taken count:
156 /// N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
157 /// Start and End points are calculated in the following way:
158 /// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
159 /// where SizeOfElt is the size of single memory access in bytes.
160 ///
161 /// There is no conflict when the intervals are disjoint:
162 /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
150163 void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
151164 unsigned DepSetId, unsigned ASId,
152165 const ValueToValueMap &Strides,
175188 if (CStep->getValue()->isNegative())
176189 std::swap(ScStart, ScEnd);
177190 } else {
178 // Fallback case: the step is not constant, but the we can still
191 // Fallback case: the step is not constant, but we can still
179192 // get the upper and lower bounds of the interval by using min/max
180193 // expressions.
181194 ScStart = SE->getUMinExpr(ScStart, ScEnd);
182195 ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
183196 }
197 // Add the size of the pointed element to ScEnd.
198 unsigned EltSize =
199 Ptr->getType()->getPointerElementType()->getScalarSizeInBits() / 8;
200 const SCEV *EltSizeSCEV = SE->getConstant(ScEnd->getType(), EltSize);
201 ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
184202 }
185203
186204 Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
18621880 Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
18631881 Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");
18641882
1865 Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
1883 // [A|B].Start points to the first accessed byte under base [A|B].
1884 // [A|B].End points to the last accessed byte, plus one.
1885 // There is no conflict when the intervals are disjoint:
1886 // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
1887 //
1888 // bound0 = (A.Start < B.End)
1889 // bound1 = (B.Start < A.End)
1890 // IsConflict = bound0 & bound1
1891 Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
18661892 FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
1867 Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
1893 Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
18681894 FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
18691895 Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
18701896 FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
0 ; RUN: opt -analyze --loop-accesses %s | FileCheck %s
1
2 ; This test verifies run-time boundary check of memory accesses.
3 ; The original loop:
4 ; void fastCopy(const char* src, char* op) {
5 ; int len = 32;
6 ; while (len > 0) {
7 ; *(reinterpret_cast<long long*>(op)) = *(reinterpret_cast<const long long*>(src));
8 ; src += 8;
9 ; op += 8;
10 ; len -= 8;
11 ; }
12 ; }
13 ; Boundary calculation before this patch:
14 ; (Low: %src High: (24 + %src))
15 ; and the actual distance between the two pointers was 31 (%op - %src = 31), so
16 ; IsConflict = (24 > 31) = false -> execution is directed to the vectorized loop.
17 ; The loop was vectorized by 4, giving 32-byte memory accesses ( <4 x i64> ), so
18 ; the store to *%op touched memory under *%src.
19
20 ;CHECK: Printing analysis 'Loop Access Analysis' for function 'fastCopy'
21 ;CHECK: (Low: %op High: (32 + %op))
22 ;CHECK: (Low: %src High: (32 + %src))
23
24 define void @fastCopy(i8* nocapture readonly %src, i8* nocapture %op) { ; test input: copies i64 values from %src to %op, advancing both by 8 bytes per iteration
25 entry:
26 br label %while.body.preheader
27
28 while.body.preheader: ; preds = %entry
29 br label %while.body
30
31 while.body: ; preds = %while.body.preheader, %while.body
32 %len.addr.07 = phi i32 [ %sub, %while.body ], [ 32, %while.body.preheader ] ; remaining length: 32 on entry, %sub on later iterations
33 %op.addr.06 = phi i8* [ %add.ptr1, %while.body ], [ %op, %while.body.preheader ] ; current destination pointer, starts at %op
34 %src.addr.05 = phi i8* [ %add.ptr, %while.body ], [ %src, %while.body.preheader ] ; current source pointer, starts at %src
35 %0 = bitcast i8* %src.addr.05 to i64* ; view the source byte pointer as i64*
36 %1 = load i64, i64* %0, align 8 ; read one i64 from the source
37 %2 = bitcast i8* %op.addr.06 to i64* ; view the destination byte pointer as i64*
38 store i64 %1, i64* %2, align 8 ; write the loaded i64 to the destination
39 %add.ptr = getelementptr inbounds i8, i8* %src.addr.05, i64 8 ; advance source by 8 bytes
40 %add.ptr1 = getelementptr inbounds i8, i8* %op.addr.06, i64 8 ; advance destination by 8 bytes
41 %sub = add nsw i32 %len.addr.07, -8 ; remaining length minus 8
42 %cmp = icmp sgt i32 %len.addr.07, 8 ; compares the pre-decrement length against 8 (signed)
43 br i1 %cmp, label %while.body, label %while.end.loopexit ; loop again while that length was greater than 8
44
45 while.end.loopexit: ; preds = %while.body
46 br label %while.end
47
48 while.end: ; preds = %while.end.loopexit, %entry
49 ret void
50 }
9595 ; CHECK-NEXT: %arrayidxB = getelementptr inbounds i16, i16* %b, i64 %ind
9696 ; CHECK-NEXT: Grouped accesses:
9797 ; CHECK-NEXT: Group {{.*}}[[ZERO]]:
98 ; CHECK-NEXT: (Low: %c High: (78 + %c))
98 ; CHECK-NEXT: (Low: %c High: (80 + %c))
9999 ; CHECK-NEXT: Member: {(2 + %c),+,4}
100100 ; CHECK-NEXT: Member: {%c,+,4}
101101 ; CHECK-NEXT: Group {{.*}}[[ONE]]:
102 ; CHECK-NEXT: (Low: %a High: (40 + %a))
102 ; CHECK-NEXT: (Low: %a High: (42 + %a))
103103 ; CHECK-NEXT: Member: {(2 + %a),+,2}
104104 ; CHECK-NEXT: Member: {%a,+,2}
105105 ; CHECK-NEXT: Group {{.*}}[[TWO]]:
106 ; CHECK-NEXT: (Low: %b High: (38 + %b))
106 ; CHECK-NEXT: (Low: %b High: (40 + %b))
107107 ; CHECK-NEXT: Member: {%b,+,2}
108108
109109 define void @testg(i16* %a,
167167 ; CHECK-NEXT: %arrayidxB = getelementptr i16, i16* %b, i64 %ind
168168 ; CHECK-NEXT: Grouped accesses:
169169 ; CHECK-NEXT: Group {{.*}}[[ZERO]]:
170 ; CHECK-NEXT: (Low: %c High: (78 + %c))
170 ; CHECK-NEXT: (Low: %c High: (80 + %c))
171171 ; CHECK-NEXT: Member: {(2 + %c),+,4}
172172 ; CHECK-NEXT: Member: {%c,+,4}
173173 ; CHECK-NEXT: Group {{.*}}[[ONE]]:
174 ; CHECK-NEXT: (Low: %a High: (40 + %a))
174 ; CHECK-NEXT: (Low: %a High: (42 + %a))
175175 ; CHECK-NEXT: Member: {(2 + %a),+,2}
176176 ; CHECK-NEXT: Member: {%a,+,2}
177177 ; CHECK-NEXT: Group {{.*}}[[TWO]]:
178 ; CHECK-NEXT: (Low: %b High: (38 + %b))
178 ; CHECK-NEXT: (Low: %b High: (40 + %b))
179179 ; CHECK-NEXT: Member: {%b,+,2}
180180
181181 define void @testh(i16* %a,
246246 ; CHECK-NEXT: %arrayidxA2 = getelementptr i16, i16* %a, i64 %ind2
247247 ; CHECK-NEXT: Grouped accesses:
248248 ; CHECK-NEXT: Group {{.*}}[[ZERO]]:
249 ; CHECK-NEXT: (Low: ((2 * %offset) + %a) High: (9998 + (2 * %offset) + %a))
249 ; CHECK-NEXT: (Low: ((2 * %offset) + %a) High: (10000 + (2 * %offset) + %a))
250250 ; CHECK-NEXT: Member: {((2 * %offset) + %a),+,2}<%for.body>
251251 ; CHECK-NEXT: Group {{.*}}[[ONE]]:
252 ; CHECK-NEXT: (Low: %a High: (9998 + %a))
252 ; CHECK-NEXT: (Low: %a High: (10000 + %a))
253253 ; CHECK-NEXT: Member: {%a,+,2}<%for.body>
254254 ; CHECK-NEXT: Group {{.*}}[[TWO]]:
255 ; CHECK-NEXT: (Low: (20000 + %a) High: (29998 + %a))
255 ; CHECK-NEXT: (Low: (20000 + %a) High: (30000 + %a))
256256 ; CHECK-NEXT: Member: {(20000 + %a),+,2}<%for.body>
257257
258258 define void @testi(i16* %a,
1515 target triple = "aarch64--linux-gnueabi"
1616
1717 ; CHECK: function 'f':
18 ; CHECK: (Low: (20000 + %a) High: (60000 + %a))
18 ; CHECK: (Low: (20000 + %a) High: (60004 + %a))
1919
2020 @B = common global i32* null, align 8
2121 @A = common global i32* null, align 8
5858 ; Here it is not obvious what the limits are, since 'step' could be negative.
5959
6060 ; CHECK: Low: (-1 + (-1 * ((-60001 + (-1 * %a)) umax (-60001 + (40000 * %step) + (-1 * %a)))))
61 ; CHECK: High: ((60000 + %a) umax (60000 + (-40000 * %step) + %a))
61 ; CHECK: High: (4 + ((60000 + %a) umax (60000 + (-40000 * %step) + %a)))
6262
6363 define void @g(i64 %step) {
6464 entry:
77 ;CHECK: br
88 ;CHECK: getelementptr
99 ;CHECK-DAG: getelementptr
10 ;CHECK-DAG: icmp uge
11 ;CHECK-DAG: icmp uge
12 ;CHECK-DAG: icmp uge
13 ;CHECK-DAG: icmp uge
10 ;CHECK-DAG: icmp ugt
11 ;CHECK-DAG: icmp ugt
12 ;CHECK-DAG: icmp ugt
13 ;CHECK-DAG: icmp ugt
1414 ;CHECK-DAG: and
1515 ;CHECK-DAG: and
1616 ;CHECK: br
3535 ; CHECK: ret i32 0
3636
3737 ; CHECK-NOTBAA-LABEL: @test1
38 ; CHECK-NOTBAA: icmp uge i32*
38 ; CHECK-NOTBAA: icmp ugt i32*
3939
4040 ; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
4141 ; CHECK-NOTBAA: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
6969 ; required. Without TBAA, however, two checks are required.
7070
7171 ; CHECK-LABEL: @test2
72 ; CHECK: icmp uge float*
73 ; CHECK: icmp uge float*
72 ; CHECK: icmp ugt float*
73 ; CHECK: icmp ugt float*
7474 ; CHECK-NOT: icmp ugt i32*
7575
7676 ; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
7979 ; CHECK: ret i32 0
8080
8181 ; CHECK-NOTBAA-LABEL: @test2
82 ; CHECK-NOTBAA: icmp uge float*
83 ; CHECK-NOTBAA: icmp uge float*
84 ; CHECK-NOTBAA-DAG: icmp uge float*
85 ; CHECK-NOTBAA-DAG: icmp uge i32*
82 ; CHECK-NOTBAA: icmp ugt float*
83 ; CHECK-NOTBAA: icmp ugt float*
84 ; CHECK-NOTBAA-DAG: icmp ugt float*
85 ; CHECK-NOTBAA-DAG: icmp ugt i32*
8686
8787 ; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
8888 ; CHECK-NOTBAA: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
77 ; CHECK-NEXT: Loop Versioning found to be beneficial
88 ;
99 ; CHECK: for.body3:
10 ; CHECK-NEXT: %add86 = phi i32 [ %arrayidx7.promoted, %for.body3.ph ], [ %add8, %for.body3 ]
10 ; CHECK-NEXT: %[[induction:.*]] = phi i32 [ %arrayidx7.promoted, %for.body3.ph ], [ %add8, %for.body3 ]
1111 ; CHECK-NEXT: %j.113 = phi i32 [ %j.016, %for.body3.ph ], [ %inc, %for.body3 ]
1212 ; CHECK-NEXT: %idxprom = zext i32 %j.113 to i64
1313 ; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, i32* %var1, i64 %idxprom
1414 ; CHECK-NEXT: store i32 %add, i32* %arrayidx, align 4, !alias.scope !6, !noalias !6
15 ; CHECK-NEXT: %add8 = add nsw i32 %add86, %add
15 ; CHECK-NEXT: %add8 = add nsw i32 %[[induction]], %add
1616 ; CHECK-NEXT: %inc = add nuw i32 %j.113, 1
1717 ; CHECK-NEXT: %cmp2 = icmp ult i32 %inc, %itr
18 ; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit5, !llvm.loop !7
18 ; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit6, !llvm.loop !7
1919 define i32 @foo(i32* nocapture %var1, i32* nocapture readnone %var2, i32* nocapture %var3, i32 %itr) #0 {
2020 entry:
2121 %cmp14 = icmp eq i32 %itr, 0