llvm.org GIT mirror llvm / 88b5ce0
[DSE] Teach the pass about partial overwrite of atomic memory intrinsics Summary: This change teaches DSE that the atomic memory intrinsics can be overwritten partially in the same way as the non-atomic forms. Specifically, that the atomic memcpy & memset can be shortened at the end and that the atomic memset can be shortened at the beginning, if they are partially overwritten by later stores. Reviewers: mkazantsev, skatkov, apilipenko, efriedma, rsmith, spatel, filcab, sanjoy Reviewed By: efriedma Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D45584 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@331991 91177308-0d34-0410-b5e6-96231b3b80d8 Daniel Neilson 1 year, 4 months ago
3 changed file(s) with 41 addition(s) and 26 deletion(s). Raw diff Collapse all Expand all
277277 default: return false;
278278 case Intrinsic::memset:
279279 case Intrinsic::memcpy:
280 case Intrinsic::memcpy_element_unordered_atomic:
281 case Intrinsic::memset_element_unordered_atomic:
280282 // Do shorten memory intrinsics.
281283 // FIXME: Add memmove if it's also safe to transform.
282 // TODO: Add atomic memcpy/memset
283284 return true;
284285 }
285286 }
294295 static bool isShortenableAtTheBeginning(Instruction *I) {
295296 // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
296297 // easily done by offsetting the source address.
297 // TODO: Handle atomic memory intrinsics
298 IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
299 return II && II->getIntrinsicID() == Intrinsic::memset;
298 return isa<AnyMemSetInst>(I);
300299 }
301300
302301 /// Return the pointer that is being written to.
896895 // Power of 2 vector writes are probably always a bad idea to optimize
897896 // as any store/memset/memcpy is likely using vector instructions so
898897 // shortening it to not vector size is likely to be slower
899 MemIntrinsic *EarlierIntrinsic = cast<MemIntrinsic>(EarlierWrite);
898 auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
900899 unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
901900 if (!IsOverwriteEnd)
902901 LaterOffset = int64_t(LaterOffset + LaterSize);
904903 if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
905904 !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
906905 return false;
906
907 int64_t NewLength = IsOverwriteEnd
908 ? LaterOffset - EarlierOffset
909 : EarlierSize - (LaterOffset - EarlierOffset);
910
911 if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
912 // When shortening an atomic memory intrinsic, the newly shortened
913 // length must remain an integer multiple of the element size.
914 const uint32_t ElementSize = AMI->getElementSizeInBytes();
915 if (0 != NewLength % ElementSize)
916 return false;
917 }
907918
908919 DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
909920 << (IsOverwriteEnd ? "END" : "BEGIN") << ": " << *EarlierWrite
910921 << "\n KILLER (offset " << LaterOffset << ", " << EarlierSize
911922 << ")\n");
912
913 int64_t NewLength = IsOverwriteEnd
914 ? LaterOffset - EarlierOffset
915 : EarlierSize - (LaterOffset - EarlierOffset);
916923
917924 Value *EarlierWriteLength = EarlierIntrinsic->getLength();
918925 Value *TrimmedLength =
2525 ; CHECK-NEXT: entry:
2626 ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
2727 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
28 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
28 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
29 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
2930 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
3031 ; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4
3132 ; CHECK-NEXT: ret void
5960 ; CHECK-LABEL: @write0to3_atomic(
6061 ; CHECK-NEXT: entry:
6162 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
62 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
63 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
64 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
6365 ; CHECK-NEXT: store atomic i32 1, i32* [[P]] unordered, align 4
6466 ; CHECK-NEXT: ret void
6567 ;
7577 ; CHECK-LABEL: @write0to3_atomic_weaker(
7678 ; CHECK-NEXT: entry:
7779 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
78 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
80 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
81 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
7982 ; CHECK-NEXT: store i32 1, i32* [[P]], align 4
8083 ; CHECK-NEXT: ret void
8184 ;
110113 ; CHECK-LABEL: @write0to7_atomic(
111114 ; CHECK-NEXT: entry:
112115 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
113 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4)
116 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 8
117 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
114118 ; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i64*
115119 ; CHECK-NEXT: store atomic i64 1, i64* [[P4]] unordered, align 8
116120 ; CHECK-NEXT: ret void
148152 ; CHECK-NEXT: entry:
149153 ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
150154 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
151 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
155 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4
156 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i32 4)
152157 ; CHECK-NEXT: [[P4:%.*]] = bitcast i32* [[P]] to i64*
153158 ; CHECK-NEXT: store atomic i64 1, i64* [[P4]] unordered, align 8
154159 ; CHECK-NEXT: ret void
306311 ; CHECK-NEXT: entry:
307312 ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
308313 ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
309 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
314 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16
315 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8)
310316 ; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0
311317 ; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
312318 ; CHECK-NEXT: store atomic i64 1, i64* [[BASE64_1]] unordered, align 8
332338 ; CHECK-NEXT: entry:
333339 ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
334340 ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
335 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
341 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16
342 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8)
336343 ; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0
337344 ; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
338345 ; CHECK-NEXT: store atomic i64 1, i64* [[BASE64_1]] unordered, align 8
358365 ; CHECK-NEXT: entry:
359366 ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
360367 ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
361 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
368 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[MYBASE0]], i64 16
369 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 16, i32 8)
362370 ; CHECK-NEXT: [[BASE64_0:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 0
363371 ; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 1
364372 ; CHECK-NEXT: store i64 1, i64* [[BASE64_1]], align 8
3131 ; CHECK-NEXT: entry:
3232 ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
3333 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
34 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
34 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i32 4)
3535 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
3636 ; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4
3737 ; CHECK-NEXT: ret void
5151 ; CHECK-NEXT: entry:
5252 ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
5353 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8*
54 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
54 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 24, i32 4)
5555 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
5656 ; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX1]], align 4
5757 ; CHECK-NEXT: ret void
8686 ; CHECK-LABEL: @write28to32_atomic(
8787 ; CHECK-NEXT: entry:
8888 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[P:%.*]] to i8*
89 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 32, i32 4)
89 ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i32 4)
9090 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
9191 ; CHECK-NEXT: store atomic i32 1, i32* [[ARRAYIDX1]] unordered, align 4
9292 ; CHECK-NEXT: ret void
154154 ; CHECK-LABEL: @write32to36_atomic(
155155 ; CHECK-NEXT: entry:
156156 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2plusi* [[P:%.*]] to i8*
157 ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i32 4)
157 ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i32 4)
158158 ; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2
159159 ; CHECK-NEXT: store atomic i32 1, i32* [[C]] unordered, align 4
160160 ; CHECK-NEXT: ret void
172172 ; CHECK-LABEL: @write32to36_atomic_weaker(
173173 ; CHECK-NEXT: entry:
174174 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2plusi* [[P:%.*]] to i8*
175 ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 36, i32 4)
175 ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2plusi* @glob2 to i8*), i64 32, i32 4)
176176 ; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2PLUSI:%.*]], %struct.vec2plusi* [[P]], i64 0, i32 2
177177 ; CHECK-NEXT: store i32 1, i32* [[C]], align 4
178178 ; CHECK-NEXT: ret void
206206 ; CHECK-LABEL: @write16to32_atomic(
207207 ; CHECK-NEXT: entry:
208208 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.vec2* [[P:%.*]] to i8*
209 ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 32, i32 4)
209 ; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 bitcast (%struct.vec2* @glob1 to i8*), i64 16, i32 4)
210210 ; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], %struct.vec2* [[P]], i64 0, i32 1
211211 ; CHECK-NEXT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* [[C]], align 4
212212 ; CHECK-NEXT: ret void
315315 ; CHECK-NEXT: entry:
316316 ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
317317 ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
318 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
318 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8)
319319 ; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
320320 ; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
321321 ; CHECK-NEXT: store atomic i64 3, i64* [[BASE64_2]] unordered, align 8
341341 ; CHECK-NEXT: entry:
342342 ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
343343 ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
344 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
344 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8)
345345 ; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
346346 ; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
347347 ; CHECK-NEXT: store i64 3, i64* [[BASE64_2]], align 8
367367 ; CHECK-NEXT: entry:
368368 ; CHECK-NEXT: [[BASE0:%.*]] = bitcast i64* [[P:%.*]] to i8*
369369 ; CHECK-NEXT: [[MYBASE0:%.*]] = getelementptr inbounds i8, i8* [[BASE0]], i64 0
370 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 32, i32 8)
370 ; CHECK-NEXT: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[MYBASE0]], i8 0, i64 16, i32 8)
371371 ; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 2
372372 ; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 3
373373 ; CHECK-NEXT: store atomic i64 3, i64* [[BASE64_2]] unordered, align 8