llvm.org GIT mirror: llvm @ 7ebe2a2

[ARM] Add macro fusion for AES instructions

Summary: This patch adds a macro fusion using CodeGen/MacroFusion.cpp to pair
AES instructions back to back, and adds FeatureFuseAES to enable the feature.

Reviewers: evandro, javed.absar, rengolin, t.p.northover

Reviewed By: javed.absar

Subscribers: aemerson, mgorny, kristof.beyls, llvm-commits

Differential Revision: https://reviews.llvm.org/D34142

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305988 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Florian Hahn
7 changed files with 302 additions and 1 deletion.
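The pairing rule this commit implements is deliberately small: keep an AESE immediately followed by an AESMC (and an AESD followed by an AESIMC) back to back, so cores that fuse these pairs can execute them as a single operation. As a minimal stand-alone sketch of that decision, with illustrative names rather than the LLVM classes the patch actually uses:

// Stand-alone sketch of the pairing rule; Opcode and fusesWith are
// illustrative stand-ins, not names from this commit.
enum Opcode { AESE, AESMC, AESD, AESIMC, OTHER };

// AESE -> AESMC (encrypt round + mix columns) and AESD -> AESIMC
// (decrypt round + inverse mix columns) are the only fused pairs.
bool fusesWith(Opcode First, Opcode Second) {
  return (First == AESE && Second == AESMC) ||
         (First == AESD && Second == AESIMC);
}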
9999 "Enable Reliability, Availability and Serviceability extensions">;
100100 def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true",
101101 "Enable fast computation of positive address offsets">;
102
102 def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true",
103 "CPU fuses AES crypto operations">;
103104
104105 // Cyclone has preferred instructions for zeroing VFP registers, which can
105106 // execute in 0 cycles.
lib/Target/ARM/ARMMacroFusion.cpp (new file)
//===- ARMMacroFusion.cpp - ARM Macro Fusion ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the ARM implementation of the DAG scheduling
/// mutation to pair instructions back to back.
//
//===----------------------------------------------------------------------===//

#include "ARMMacroFusion.h"
#include "ARMSubtarget.h"
#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/Target/TargetInstrInfo.h"

namespace llvm {

/// \brief Check if the instruction pair, FirstMI and SecondMI, should be
/// fused together. Given SecondMI, when FirstMI is unspecified, check whether
/// SecondMI may be part of a fused pair at all.
static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
                                   const TargetSubtargetInfo &TSI,
                                   const MachineInstr *FirstMI,
                                   const MachineInstr &SecondMI) {
  const ARMSubtarget &ST = static_cast<const ARMSubtarget &>(TSI);

  // Assume wildcards for unspecified instrs.
  unsigned FirstOpcode =
      FirstMI ? FirstMI->getOpcode()
              : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
  unsigned SecondOpcode = SecondMI.getOpcode();

  if (ST.hasFuseAES())
    // Fuse AES crypto operations.
    switch (SecondOpcode) {
    // AES encode.
    case ARM::AESMC:
      return FirstOpcode == ARM::AESE ||
             FirstOpcode == ARM::INSTRUCTION_LIST_END;
    // AES decode.
    case ARM::AESIMC:
      return FirstOpcode == ARM::AESD ||
             FirstOpcode == ARM::INSTRUCTION_LIST_END;
    }

  return false;
}

std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation() {
  return createMacroFusionDAGMutation(shouldScheduleAdjacent);
}

} // end namespace llvm
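A note on the FirstMI == nullptr case above: the generic mutation in CodeGen/MacroFusion.cpp first issues a wildcard query, asking whether SecondMI can end any fused pair before testing concrete predecessors, which is why the unspecified opcode defaults to ARM::INSTRUCTION_LIST_END and satisfies either return. A hedged sketch of that two-step protocol, using a stand-in predicate type rather than the real callback signature:

#include <functional>

struct MachineInstr { unsigned Opcode; }; // stand-in for the LLVM type

using FusionPred =
    std::function<bool(const MachineInstr *, const MachineInstr &)>;

// Two-step query protocol assumed by shouldScheduleAdjacent's contract;
// wantsFusion is an illustrative caller, not code from this patch.
bool wantsFusion(const FusionPred &Pred, const MachineInstr &First,
                 const MachineInstr &Second) {
  // Wildcard query: can Second be the tail of any fused pair?
  if (!Pred(nullptr, Second))
    return false;
  // Concrete query: does First specifically pair with Second?
  return Pred(&First, Second);
}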
lib/Target/ARM/ARMMacroFusion.h (new file)
//===- ARMMacroFusion.h - ARM Macro Fusion --------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the ARM definition of the DAG scheduling mutation
/// to pair instructions back to back.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/MachineScheduler.h"

namespace llvm {

/// Note that you have to add:
///   DAG.addMutation(createARMMacroFusionDAGMutation());
/// to ARMPassConfig::createMachineScheduler() to have an effect.
std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation();

} // end namespace llvm
lib/Target/ARM/ARMSubtarget.h
  /// HasFPAO - if true, processor does positive address offset computation faster
  bool HasFPAO = false;

  /// HasFuseAES - if true, processor executes back to back AES instruction
  /// pairs faster.
  bool HasFuseAES = false;

  /// If true, if-conversion may decide to leave some instructions unpredicated.
  bool IsProfitableToUnpredicate = false;

  ...

  bool hasD16() const { return HasD16; }
  bool hasFullFP16() const { return HasFullFP16; }

  bool hasFuseAES() const { return HasFuseAES; }
  /// \brief Return true if the CPU supports any kind of instruction fusion.
  bool hasFusion() const { return hasFuseAES(); }

  const Triple &getTargetTriple() const { return TargetTriple; }

  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
lib/Target/ARM/ARMTargetMachine.cpp
#include "ARMRegisterBankInfo.h"
#endif
#include "ARMSubtarget.h"
#include "ARMMacroFusion.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
#include "ARMTargetTransformInfo.h"

...

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    // add DAG Mutations here.
    const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>();
    if (ST.hasFusion())
      DAG->addMutation(createARMMacroFusionDAGMutation());
    return DAG;
  }

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
    // add DAG Mutations here.
    const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>();
    if (ST.hasFusion())
      DAG->addMutation(createARMMacroFusionDAGMutation());
    return DAG;
  }
lib/Target/ARM/CMakeLists.txt
  ARMLoadStoreOptimizer.cpp
  ARMMCInstLower.cpp
  ARMMachineFunctionInfo.cpp
  ARMMacroFusion.cpp
  ARMRegisterInfo.cpp
  ARMOptimizeBarriersPass.cpp
  ARMSelectionDAGInfo.cpp
test/CodeGen/ARM/misched-fusion-aes.ll (new file)
; RUN: llc %s -o - -mtriple=armv8 -mattr=+crypto,+fuse-aes -enable-misched -disable-post-ra | FileCheck %s

declare <16 x i8> @llvm.arm.neon.aese(<16 x i8> %d, <16 x i8> %k)
declare <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %d)
declare <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %d, <16 x i8> %k)
declare <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %d)

define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {
  %d0 = load <16 x i8>, <16 x i8>* %a0
  %a1 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 1
  %d1 = load <16 x i8>, <16 x i8>* %a1
  %a2 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 2
  %d2 = load <16 x i8>, <16 x i8>* %a2
  %a3 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 3
  %d3 = load <16 x i8>, <16 x i8>* %a3
  %k0 = load <16 x i8>, <16 x i8>* %b0
  %e00 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %d0, <16 x i8> %k0)
  %f00 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e00)
  %e01 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %d1, <16 x i8> %k0)
  %f01 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e01)
  %e02 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %d2, <16 x i8> %k0)
  %f02 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e02)
  %e03 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %d3, <16 x i8> %k0)
  %f03 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e03)
  %b1 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 1
  %k1 = load <16 x i8>, <16 x i8>* %b1
  %e10 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f00, <16 x i8> %k1)
  %f10 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e00)
  %e11 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f01, <16 x i8> %k1)
  %f11 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e01)
  %e12 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f02, <16 x i8> %k1)
  %f12 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e02)
  %e13 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f03, <16 x i8> %k1)
  %f13 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e03)
  %b2 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 2
  %k2 = load <16 x i8>, <16 x i8>* %b2
  %e20 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f10, <16 x i8> %k2)
  %f20 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e10)
  %e21 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f11, <16 x i8> %k2)
  %f21 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e11)
  %e22 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f12, <16 x i8> %k2)
  %f22 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e12)
  %e23 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f13, <16 x i8> %k2)
  %f23 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e13)
  %b3 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 3
  %k3 = load <16 x i8>, <16 x i8>* %b3
  %e30 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f20, <16 x i8> %k3)
  %f30 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e20)
  %e31 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f21, <16 x i8> %k3)
  %f31 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e21)
  %e32 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f22, <16 x i8> %k3)
  %f32 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e22)
  %e33 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f23, <16 x i8> %k3)
  %f33 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %e23)
  %g0 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f30, <16 x i8> %d)
  %h0 = xor <16 x i8> %g0, %e
  %g1 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f31, <16 x i8> %d)
  %h1 = xor <16 x i8> %g1, %e
  %g2 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f32, <16 x i8> %d)
  %h2 = xor <16 x i8> %g2, %e
  %g3 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %f33, <16 x i8> %d)
  %h3 = xor <16 x i8> %g3, %e
  store <16 x i8> %h0, <16 x i8>* %c0
  %c1 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 1
  store <16 x i8> %h1, <16 x i8>* %c1
  %c2 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 2
  store <16 x i8> %h2, <16 x i8>* %c2
  %c3 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 3
  store <16 x i8> %h3, <16 x i8>* %c3
  ret void

; CHECK-LABEL: aesea:
; CHECK: aese.8 [[QA:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QA]]
; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]]
; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]]
; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]]
; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]]
; CHECK: aese.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QF]]
; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]]
; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]]
}

define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {
  %d0 = load <16 x i8>, <16 x i8>* %a0
  %a1 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 1
  %d1 = load <16 x i8>, <16 x i8>* %a1
  %a2 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 2
  %d2 = load <16 x i8>, <16 x i8>* %a2
  %a3 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 3
  %d3 = load <16 x i8>, <16 x i8>* %a3
  %k0 = load <16 x i8>, <16 x i8>* %b0
  %e00 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %d0, <16 x i8> %k0)
  %f00 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e00)
  %e01 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %d1, <16 x i8> %k0)
  %f01 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e01)
  %e02 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %d2, <16 x i8> %k0)
  %f02 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e02)
  %e03 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %d3, <16 x i8> %k0)
  %f03 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e03)
  %b1 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 1
  %k1 = load <16 x i8>, <16 x i8>* %b1
  %e10 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f00, <16 x i8> %k1)
  %f10 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e00)
  %e11 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f01, <16 x i8> %k1)
  %f11 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e01)
  %e12 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f02, <16 x i8> %k1)
  %f12 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e02)
  %e13 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f03, <16 x i8> %k1)
  %f13 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e03)
  %b2 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 2
  %k2 = load <16 x i8>, <16 x i8>* %b2
  %e20 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f10, <16 x i8> %k2)
  %f20 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e10)
  %e21 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f11, <16 x i8> %k2)
  %f21 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e11)
  %e22 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f12, <16 x i8> %k2)
  %f22 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e12)
  %e23 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f13, <16 x i8> %k2)
  %f23 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e13)
  %b3 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 3
  %k3 = load <16 x i8>, <16 x i8>* %b3
  %e30 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f20, <16 x i8> %k3)
  %f30 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e20)
  %e31 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f21, <16 x i8> %k3)
  %f31 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e21)
  %e32 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f22, <16 x i8> %k3)
  %f32 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e22)
  %e33 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f23, <16 x i8> %k3)
  %f33 = call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %e23)
  %g0 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f30, <16 x i8> %d)
  %h0 = xor <16 x i8> %g0, %e
  %g1 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f31, <16 x i8> %d)
  %h1 = xor <16 x i8> %g1, %e
  %g2 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f32, <16 x i8> %d)
  %h2 = xor <16 x i8> %g2, %e
  %g3 = call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %f33, <16 x i8> %d)
  %h3 = xor <16 x i8> %g3, %e
  store <16 x i8> %h0, <16 x i8>* %c0
  %c1 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 1
  store <16 x i8> %h1, <16 x i8>* %c1
  %c2 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 2
  store <16 x i8> %h2, <16 x i8>* %c2
  %c3 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 3
  store <16 x i8> %h3, <16 x i8>* %c3
  ret void

; CHECK-LABEL: aesda:
; CHECK: aesd.8 [[QA:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QA]]
; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]]
; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]]
; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]]
; CHECK: aesd.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]]
; CHECK: aesd.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QF]]
; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QG]]
; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
; CHECK: aesd.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QH]]
}

define void @aes_load_store(<16 x i8> *%p1, <16 x i8> *%p2, <16 x i8> *%p3) {
entry:
  %x1 = alloca <16 x i8>, align 16
  %x2 = alloca <16 x i8>, align 16
  %x3 = alloca <16 x i8>, align 16
  %x4 = alloca <16 x i8>, align 16
  %x5 = alloca <16 x i8>, align 16
  %in1 = load <16 x i8>, <16 x i8>* %p1, align 16
  store <16 x i8> %in1, <16 x i8>* %x1, align 16
  %aese1 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %in1, <16 x i8> %in1) #2
  store <16 x i8> %aese1, <16 x i8>* %x2, align 16
  %in2 = load <16 x i8>, <16 x i8>* %p2, align 16
  %aesmc1 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %aese1) #2
  store <16 x i8> %aesmc1, <16 x i8>* %x3, align 16
  %aese2 = call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %in1, <16 x i8> %in2) #2
  store <16 x i8> %aese2, <16 x i8>* %x4, align 16
  %aesmc2 = call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %aese2) #2
  store <16 x i8> %aesmc2, <16 x i8>* %x5, align 16
  ret void

; CHECK-LABEL: aes_load_store:
; CHECK: aese.8 [[QA:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QA]]
; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]]
}