llvm.org GIT mirror llvm / 94471e3
[llvm-profdata] Speed up merging by using a thread pool Add a "-j" option to llvm-profdata to control the number of threads used. Auto-detect NumThreads when it isn't specified, and avoid spawning threads when they wouldn't be beneficial. I tested this patch using a raw profile produced by clang (147MB). Here is the time taken to merge 4 copies together on my laptop: No thread pool: 112.87s user 5.92s system 97% cpu 2:01.08 total With 2 threads: 134.99s user 26.54s system 164% cpu 1:33.31 total Differential Revision: https://reviews.llvm.org/D22438 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@275921 91177308-0d34-0410-b5e6-96231b3b80d8 Vedant Kumar 3 years ago
6 changed file(s) with 197 addition(s) and 25 deletion(s). Raw diff Collapse all Expand all
105105 conjunction with -instr. Defaults to false, since it can inhibit compiler
106106 optimization during PGO.
107107
108 .. option:: -num-threads=N, -j=N
109
110 Use N threads to perform profile merging. When N=0, llvm-profdata auto-detects
111 an appropriate number of threads to use. This is the default.
112
108113 EXAMPLES
109114 ^^^^^^^^
110115 Basic Usage
4646 /// for this function and the hash and number of counts match, each counter is
4747 /// summed. Optionally scale counts by \p Weight.
4848 Error addRecord(InstrProfRecord &&I, uint64_t Weight = 1);
49 /// Merge existing function counts from the given writer.
50 Error mergeRecordsFromWriter(InstrProfWriter &&IPW);
4951 /// Write the profile to \c OS
5052 void write(raw_fd_ostream &OS);
5153 /// Write the profile in text format to \c OS
181181 return Dest.takeError();
182182 }
183183
184 Error InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW) {
185 for (auto &I : IPW.FunctionData)
186 for (auto &Func : I.getValue())
187 if (Error E = addRecord(std::move(Func.second), 1))
188 return E;
189 return Error::success();
190 }
191
184192 bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
185193 if (!Sparse)
186194 return true;
5050 DISJOINT: Total functions: 2
5151 DISJOINT: Maximum function count: 1
5252 DISJOINT: Maximum internal block count: 3
53
54 RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
55 RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
56 RUN: -num-threads 2 -o %t
57 RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO4
58 RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
59 RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
60 RUN: -j 3 -o %t
61 RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO4
62 FOO4: foo:
63 FOO4: Counters: 3
64 FOO4: Function count: 4
65 FOO4: Block counts: [8, 12]
66 FOO4: Total functions: 1
67 FOO4: Maximum function count: 4
68 FOO4: Maximum internal block count: 12
69
70 RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
71 RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
72 RUN: %p/Inputs/foo3-1.proftext -j 2 -o %t
73 RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
74 RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
75 RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
76 RUN: %p/Inputs/foo3-1.proftext -j 3 -o %t
77 RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
78 RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
79 RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
80 RUN: %p/Inputs/foo3-1.proftext -o %t
81 RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
82 RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
83 RUN: %p/Inputs/foo3-1.proftext %p/Inputs/foo3-1.proftext \
84 RUN: %p/Inputs/foo3-1.proftext -j 1 -o %t
85 RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO5
86 FOO5: foo:
87 FOO5: Counters: 3
88 FOO5: Function count: 5
89 FOO5: Block counts: [10, 15]
90 FOO5: Total functions: 1
91 FOO5: Maximum function count: 5
92 FOO5: Maximum internal block count: 15
2828 #include "llvm/Support/Path.h"
2929 #include "llvm/Support/PrettyStackTrace.h"
3030 #include "llvm/Support/Signals.h"
31 #include "llvm/Support/ThreadPool.h"
3132 #include "llvm/Support/raw_ostream.h"
3233 #include
3334
116117 };
117118 typedef SmallVector WeightedFileVector;
118119
120 /// Keep track of merged data and reported errors.
121 struct WriterContext {
122 std::mutex Lock;
123 InstrProfWriter Writer;
124 Error Err;
125 StringRef ErrWhence;
126 std::mutex &ErrLock;
127 SmallSet &WriterErrorCodes;
128
129 WriterContext(bool IsSparse, std::mutex &ErrLock,
130 SmallSet &WriterErrorCodes)
131 : Lock(), Writer(IsSparse), Err(Error::success()), ErrWhence(""),
132 ErrLock(ErrLock), WriterErrorCodes(WriterErrorCodes) {}
133 };
134
135 /// Load an input into a writer context.
136 static void loadInput(const WeightedFile &Input, WriterContext *WC) {
137 std::unique_lock CtxGuard{WC->Lock};
138
139 // If there's a pending hard error, don't do more work.
140 if (WC->Err)
141 return;
142
143 WC->ErrWhence = Input.Filename;
144
145 auto ReaderOrErr = InstrProfReader::create(Input.Filename);
146 if ((WC->Err = ReaderOrErr.takeError()))
147 return;
148
149 auto Reader = std::move(ReaderOrErr.get());
150 bool IsIRProfile = Reader->isIRLevelProfile();
151 if (WC->Writer.setIsIRLevelProfile(IsIRProfile)) {
152 WC->Err = make_error(
153 "Merge IR generated profile with Clang generated profile.",
154 std::error_code());
155 return;
156 }
157
158 for (auto &I : *Reader) {
159 if (Error E = WC->Writer.addRecord(std::move(I), Input.Weight)) {
160 // Only show hint the first time an error occurs.
161 instrprof_error IPE = InstrProfError::take(std::move(E));
162 std::unique_lock ErrGuard{WC->ErrLock};
163 bool firstTime = WC->WriterErrorCodes.insert(IPE).second;
164 handleMergeWriterError(make_error(IPE), Input.Filename,
165 I.Name, firstTime);
166 }
167 }
168 if (Reader->hasError())
169 WC->Err = Reader->getError();
170 }
171
172 /// Merge the \p Src writer context into \p Dst.
173 static void mergeWriterContexts(WriterContext *Dst, WriterContext *Src) {
174 if (Error E = Dst->Writer.mergeRecordsFromWriter(std::move(Src->Writer)))
175 Dst->Err = std::move(E);
176 }
177
119178 static void mergeInstrProfile(const WeightedFileVector &Inputs,
120179 StringRef OutputFilename,
121 ProfileFormat OutputFormat, bool OutputSparse) {
180 ProfileFormat OutputFormat, bool OutputSparse,
181 unsigned NumThreads) {
122182 if (OutputFilename.compare("-") == 0)
123183 exitWithError("Cannot write indexed profdata format to stdout.");
124184
130190 if (EC)
131191 exitWithErrorCode(EC, OutputFilename);
132192
133 InstrProfWriter Writer(OutputSparse);
193 std::mutex ErrorLock;
134194 SmallSet WriterErrorCodes;
135 for (const auto &Input : Inputs) {
136 auto ReaderOrErr = InstrProfReader::create(Input.Filename);
137 if (Error E = ReaderOrErr.takeError())
138 exitWithError(std::move(E), Input.Filename);
139
140 auto Reader = std::move(ReaderOrErr.get());
141 bool IsIRProfile = Reader->isIRLevelProfile();
142 if (Writer.setIsIRLevelProfile(IsIRProfile))
143 exitWithError("Merge IR generated profile with Clang generated profile.");
144
145 for (auto &I : *Reader) {
146 if (Error E = Writer.addRecord(std::move(I), Input.Weight)) {
147 // Only show hint the first time an error occurs.
148 instrprof_error IPE = InstrProfError::take(std::move(E));
149 bool firstTime = WriterErrorCodes.insert(IPE).second;
150 handleMergeWriterError(make_error(IPE), Input.Filename,
151 I.Name, firstTime);
152 }
153 }
154 if (Reader->hasError())
155 exitWithError(Reader->getError(), Input.Filename);
156 }
195
196 // If NumThreads is not specified, auto-detect a good default.
197 if (NumThreads == 0)
198 NumThreads = std::max(1U, std::min(std::thread::hardware_concurrency(),
199 unsigned(Inputs.size() / 2)));
200
201 // Initialize the writer contexts.
202 SmallVector, 4> Contexts;
203 for (unsigned I = 0; I < NumThreads; ++I)
204 Contexts.emplace_back(llvm::make_unique(
205 OutputSparse, ErrorLock, WriterErrorCodes));
206
207 if (NumThreads == 1) {
208 for (const auto &Input : Inputs)
209 loadInput(Input, Contexts[0].get());
210 } else {
211 ThreadPool Pool(NumThreads);
212
213 // Load the inputs in parallel (N/NumThreads serial steps).
214 unsigned Ctx = 0;
215 for (const auto &Input : Inputs) {
216 Pool.async(loadInput, Input, Contexts[Ctx].get());
217 Ctx = (Ctx + 1) % NumThreads;
218 }
219 Pool.wait();
220
221 // Merge the writer contexts together (lg(NumThreads) serial steps).
222 unsigned Mid = Contexts.size() / 2;
223 unsigned End = Contexts.size();
224 assert(Mid > 0 && "Expected more than one context");
225 do {
226 for (unsigned I = 0; I < Mid; ++I)
227 Pool.async(mergeWriterContexts, Contexts[I].get(),
228 Contexts[I + Mid].get());
229 if (End & 1)
230 Pool.async(mergeWriterContexts, Contexts[0].get(),
231 Contexts[End - 1].get());
232 Pool.wait();
233 End = Mid;
234 Mid /= 2;
235 } while (Mid > 0);
236 }
237
238 // Handle deferred hard errors encountered during merging.
239 for (std::unique_ptr &WC : Contexts)
240 if (WC->Err)
241 exitWithError(std::move(WC->Err), WC->ErrWhence);
242
243 InstrProfWriter &Writer = Contexts[0]->Writer;
157244 if (OutputFormat == PF_Text)
158245 Writer.writeText(Output);
159246 else
287374 clEnumValEnd));
288375 cl::opt OutputSparse("sparse", cl::init(false),
289376 cl::desc("Generate a sparse profile (only meaningful for -instr)"));
377 cl::opt NumThreads(
378 "num-threads", cl::init(0),
379 cl::desc("Number of merge threads to use (default: autodetect)"));
380 cl::alias NumThreadsA("j", cl::desc("Alias for --num-threads"),
381 cl::aliasopt(NumThreads));
290382
291383 cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n");
292384
313405
314406 if (ProfileKind == instr)
315407 mergeInstrProfile(WeightedInputs, OutputFilename, OutputFormat,
316 OutputSparse);
408 OutputSparse, NumThreads);
317409 else
318410 mergeSampleProfile(WeightedInputs, OutputFilename, OutputFormat);
319411
203203 delete PSFromMD;
204204 }
205205
206 TEST_F(InstrProfTest, test_writer_merge) {
207 InstrProfRecord Record1("func1", 0x1234, {42});
208 NoError(Writer.addRecord(std::move(Record1)));
209
210 InstrProfWriter Writer2;
211 InstrProfRecord Record2("func2", 0x1234, {0, 0});
212 NoError(Writer2.addRecord(std::move(Record2)));
213
214 NoError(Writer.mergeRecordsFromWriter(std::move(Writer2)));
215
216 auto Profile = Writer.writeBuffer();
217 readProfile(std::move(Profile));
218
219 Expected R = Reader->getInstrProfRecord("func1", 0x1234);
220 ASSERT_TRUE(NoError(R.takeError()));
221 ASSERT_EQ(1U, R->Counts.size());
222 ASSERT_EQ(42U, R->Counts[0]);
223
224 R = Reader->getInstrProfRecord("func2", 0x1234);
225 ASSERT_TRUE(NoError(R.takeError()));
226 ASSERT_EQ(2U, R->Counts.size());
227 ASSERT_EQ(0U, R->Counts[0]);
228 ASSERT_EQ(0U, R->Counts[1]);
229 }
230
206231 static const char callee1[] = "callee1";
207232 static const char callee2[] = "callee2";
208233 static const char callee3[] = "callee3";