llvm.org GIT mirror llvm / a1535e3
[NVPTX] Honor alignment on vector loads/stores We were not considering the stated alignment on vector loads/stores, leading us to generate vector instructions even when we do not have sufficient alignment. Now, for IR like: %1 = load <4 x float>, <4 x float>* %ptr, align 4 we will generate correct, conservative PTX like: ld.f32 ... [%ptr] ld.f32 ... [%ptr+4] ld.f32 ... [%ptr+8] ld.f32 ... [%ptr+12] Or if we have an alignment of 8 (for example), we can generate code like: ld.v2.f32 ... [%ptr] ld.v2.f32 ... [%ptr+8] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213186 91177308-0d34-0410-b5e6-96231b3b80d8 Justin Holewinski 6 years ago
2 changed file(s) with 108 addition(s) and 5 deletion(s). Raw diff Collapse all Expand all
14931493 break;
14941494 }
14951495
1496 MemSDNode *MemSD = cast<MemSDNode>(N);
1497 const DataLayout *TD = getDataLayout();
1498
1499 unsigned Align = MemSD->getAlignment();
1500 unsigned PrefAlign =
1501 TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
1502 if (Align < PrefAlign) {
1503 // This store is not sufficiently aligned, so bail out and let this vector
1504 // store be scalarized. Note that we may still be able to emit smaller
1505 // vector stores. For example, if we are storing a <4 x float> with an
1506 // alignment of 8, this check will fail but the legalizer will try again
1507 // with 2 x <2 x float>, which will succeed with an alignment of 8.
1508 return SDValue();
1509 }
1510
14961511 unsigned Opcode = 0;
14971512 EVT EltVT = ValVT.getVectorElementType();
14981513 unsigned NumElts = ValVT.getVectorNumElements();
15341549 for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) {
15351550 Ops.push_back(N->getOperand(i));
15361551 }
1537
1538 MemSDNode *MemSD = cast<MemSDNode>(N);
15391552
15401553 SDValue NewSt = DAG.getMemIntrinsicNode(
15411554 Opcode, DL, DAG.getVTList(MVT::Other), Ops,
30453058
30463059 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
30473060 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
3061 const DataLayout *TD,
30483062 SmallVectorImpl<SDValue> &Results) {
30493063 EVT ResVT = N->getValueType(0);
30503064 SDLoc DL(N);
30723086 break;
30733087 }
30743088
3089 LoadSDNode *LD = cast<LoadSDNode>(N);
3090
3091 unsigned Align = LD->getAlignment();
3092 unsigned PrefAlign =
3093 TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
3094 if (Align < PrefAlign) {
3095 // This load is not sufficiently aligned, so bail out and let this vector
3096 // load be scalarized. Note that we may still be able to emit smaller
3097 // vector loads. For example, if we are loading a <4 x float> with an
3098 // alignment of 8, this check will fail but the legalizer will try again
3099 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3100 return;
3101 }
3102
30753103 EVT EltVT = ResVT.getVectorElementType();
30763104 unsigned NumElts = ResVT.getVectorNumElements();
30773105
31073135 // Copy regular operands
31083136 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
31093137 OtherOps.push_back(N->getOperand(i));
3110
3111 LoadSDNode *LD = cast<LoadSDNode>(N);
31123138
31133139 // The select routine does not have access to the LoadSDNode instance, so
31143140 // pass along the extension information
32823308 default:
32833309 report_fatal_error("Unhandled custom legalization");
32843310 case ISD::LOAD:
3285 ReplaceLoadVector(N, DAG, Results);
3311 ReplaceLoadVector(N, DAG, getDataLayout(), Results);
32863312 return;
32873313 case ISD::INTRINSIC_W_CHAIN:
32883314 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
0 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
1
2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
3 target triple = "nvptx64-nvidia-cuda"
4
5 ; CHECK-LABEL: t1
6 define <4 x float> @t1(i8* %p1) {
; align 1: no 4-byte (or wider) access is legal, so the <4 x float> load must
; not be emitted as a vector load (ld.v4/ld.v2) or even scalar f32 loads;
; only byte-wide ld.u8 instructions are acceptable.
7 ; CHECK-NOT: ld.v4
8 ; CHECK-NOT: ld.v2
9 ; CHECK-NOT: ld.f32
10 ; CHECK: ld.u8
11 %cast = bitcast i8* %p1 to <4 x float>*
12 %r = load <4 x float>* %cast, align 1
13 ret <4 x float> %r
14 }
15
16 ; CHECK-LABEL: t2
17 define <4 x float> @t2(i8* %p1) {
; align 4: wide vector loads (ld.v4 needs 16-byte, ld.v2 needs 8-byte
; alignment) are forbidden, but the load may be split into scalar ld.f32s.
18 ; CHECK-NOT: ld.v4
19 ; CHECK-NOT: ld.v2
20 ; CHECK: ld.f32
21 %cast = bitcast i8* %p1 to <4 x float>*
22 %r = load <4 x float>* %cast, align 4
23 ret <4 x float> %r
24 }
25
26 ; CHECK-LABEL: t3
27 define <4 x float> @t3(i8* %p1) {
; align 8: not enough for a single ld.v4, but the legalizer may retry with
; two <2 x float> halves, so a pair of ld.v2 loads is expected.
28 ; CHECK-NOT: ld.v4
29 ; CHECK: ld.v2
30 %cast = bitcast i8* %p1 to <4 x float>*
31 %r = load <4 x float>* %cast, align 8
32 ret <4 x float> %r
33 }
34
35 ; CHECK-LABEL: t4
36 define <4 x float> @t4(i8* %p1) {
; align 16: full natural alignment for <4 x float>, so a single ld.v4 is fine.
37 ; CHECK: ld.v4
38 %cast = bitcast i8* %p1 to <4 x float>*
39 %r = load <4 x float>* %cast, align 16
40 ret <4 x float> %r
41 }
42
43
44 ; CHECK-LABEL: s1
45 define void @s1(<4 x float>* %p1, <4 x float> %v) {
; Store mirror of t1 — align 1: no vector (st.v4/st.v2) or scalar f32 stores
; are legal; the store must be scalarized down to byte-wide st.u8.
46 ; CHECK-NOT: st.v4
47 ; CHECK-NOT: st.v2
48 ; CHECK-NOT: st.f32
49 ; CHECK: st.u8
50 store <4 x float> %v, <4 x float>* %p1, align 1
51 ret void
52 }
53
54 ; CHECK-LABEL: s2
55 define void @s2(<4 x float>* %p1, <4 x float> %v) {
; Store mirror of t2 — align 4: vector stores are forbidden, but the store
; may be split into element-wise scalar st.f32s.
56 ; CHECK-NOT: st.v4
57 ; CHECK-NOT: st.v2
58 ; CHECK: st.f32
59 store <4 x float> %v, <4 x float>* %p1, align 4
60 ret void
61 }
62
63 ; CHECK-LABEL: s3
64 define void @s3(<4 x float>* %p1, <4 x float> %v) {
; Store mirror of t3 — align 8: a single st.v4 (which needs 16-byte
; alignment) must not be emitted; smaller accesses are acceptable.
65 ; CHECK-NOT: st.v4
66 store <4 x float> %v, <4 x float>* %p1, align 8
67 ret void
68 }
69
70 ; CHECK-LABEL: s4
71 define void @s4(<4 x float>* %p1, <4 x float> %v) {
; Store mirror of t4 — align 16: fully aligned, so one st.v4 is expected.
72 ; CHECK: st.v4
73 store <4 x float> %v, <4 x float>* %p1, align 16
74 ret void
75 }
76