
Commit 679c8aa

andreas-abel authored and copybara-github committed
Reduce overhead in TCMalloc benchmark
PiperOrigin-RevId: 783348068
Change-Id: If7c9612666ff31ac4f3dd6dc53752fb0410b5fc9
1 parent 32c68c2 commit 679c8aa

4 files changed: 60 additions & 72 deletions

fleetbench/tcmalloc/empirical.cc

Lines changed: 28 additions & 33 deletions
@@ -86,7 +86,9 @@ EmpiricalData::EmpiricalData(size_t seed, const absl::Span<const Entry> weights,
       total_bytes_allocated_(0),
       birth_sampler_(BirthRateDistribution(weights)),
       total_birth_rate_(0),
-      death_sampler_(weights.size()) {
+      death_sampler_(weights.size()),
+      num_allocated_recorded_(0),
+      bytes_allocated_recorded_(0) {
   // First, compute average live count for each size in a heap of size
   // <total_mem>.
   double total = 0;
@@ -168,9 +170,12 @@ void EmpiricalData::DoDeath(const size_t i) {
 }
 
 void EmpiricalData::RecordBirth(const size_t i) {
+  birth_or_death_.push_back(true);
   birth_or_death_sizes_.push_back(i);
   SizeState& s = state_[i];
   death_sampler_.AdjustWeight(i, s.death_rate);
+  num_allocated_recorded_++;
+  bytes_allocated_recorded_ += s.size;
   // We only care about keeping the number of objects correct when building the
   // trace. When we replay we will actually push the allocated address but
   // when building the trace we can just push nullptr to keep the length of live
@@ -181,14 +186,13 @@ void EmpiricalData::RecordBirth(const size_t i) {
 void* EmpiricalData::ReplayBirth(const size_t i) {
   SizeState& s = state_[i];
   const size_t size = s.size;
-  total_num_allocated_++;
-  total_bytes_allocated_ += size;
   void* p = alloc_(size);
   s.objs.push_back(p);
   return p;
 }
 
 void EmpiricalData::RecordDeath(const size_t i) {
+  birth_or_death_.push_back(false);
   SizeState& s = state_[i];
   CHECK(!s.objs.empty());
   birth_or_death_sizes_.push_back(i);
@@ -215,7 +219,6 @@ void EmpiricalData::RecordNext() {
   const double Both = B + T;
   absl::uniform_real_distribution<double> which(0, Both);
   bool do_birth = which(rng_) < B;
-  birth_or_death_.push_back(do_birth);
 
   if (do_birth) {
     size_t i = birth_sampler_(rng_);
@@ -226,18 +229,25 @@ void EmpiricalData::RecordNext() {
   }
 }
 
-void EmpiricalData::ReplayNext() {
-  bool do_birth = birth_or_death_[birth_or_death_index_];
-  if (do_birth) {
-    void* allocated = ReplayBirth(birth_or_death_sizes_[birth_or_death_index_]);
-    TouchAllocated(allocated);
-  } else {
-    ReplayDeath(birth_or_death_sizes_[birth_or_death_index_],
-                death_objects_[death_object_index_]);
-    __builtin_prefetch(death_object_pointers_[death_object_index_], 1, 3);
-    death_object_index_++;
+void EmpiricalData::ReplayTrace() {
+  for (birth_or_death_index_ = 0, death_object_index_ = 0;
+       birth_or_death_index_ < birth_or_death_.size();
+       ++birth_or_death_index_) {
+    bool do_birth = birth_or_death_[birth_or_death_index_];
+    if (do_birth) {
+      void* allocated =
+          ReplayBirth(birth_or_death_sizes_[birth_or_death_index_]);
+      TouchAllocated(allocated);
+    } else {
+      ReplayDeath(birth_or_death_sizes_[birth_or_death_index_],
+                  death_objects_[death_object_index_]);
+      __builtin_prefetch(death_object_pointers_[death_object_index_], /*rw=*/1,
+                         /*locality*/ 3);
+      ++death_object_index_;
+    }
   }
-  birth_or_death_index_++;
+  total_num_allocated_ += num_allocated_recorded_;
+  total_bytes_allocated_ += bytes_allocated_recorded_;
 }
 
 void EmpiricalData::SnapshotLiveObjects() {
@@ -301,7 +311,7 @@ void EmpiricalData::BuildDeathObjectPointers() {
                                death_object_pointers_.end());
 }
 
-void EmpiricalData::RepairToSnapshotState() {
+void EmpiricalData::RecordRepairToSnapshotState() {
   // Compared to the number of live objects when the snapshot was taken each
   // size state either
   // 1) Contains the same number of live objects as when the snapshot was taken,
@@ -312,29 +322,14 @@ void EmpiricalData::RepairToSnapshotState() {
   // number of true deallocations.
   for (int i = 0; i < state_.size(); i++) {
     while (state_[i].objs.size() < snapshot_state_[i].objs.size()) {
-      DoBirth(i);
+      RecordBirth(i);
     }
     while (state_[i].objs.size() > snapshot_state_[i].objs.size()) {
-      DoDeath(i);
+      RecordDeath(i);
     }
   }
 }
 
-void EmpiricalData::RestartTraceIfNecessary() {
-  if (birth_or_death_index_ == birth_or_death_.size()) {
-    // As the snapshotted lists of live objects will contain addresses which
-    // have already been freed we can't just call RestoreSnapshot(). Instead
-    // let's do the necessary allocations / deallocations to end up with the
-    // identical number of live objects we had when initially building the
-    // trace.
-    RepairToSnapshotState();
-    // After the above call we can safely run through the recorded trace
-    // again.
-    birth_or_death_index_ = 0;
-    death_object_index_ = 0;
-  }
-}
-
 std::vector<EmpiricalData::Entry> GetEmpiricalDataEntries(
     absl::string_view file) {
   std::vector<EmpiricalData::Entry> distribution;
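
The heart of the change is ReplayTrace() above: the per-operation work of the old ReplayNext() (one call per alloc/dealloc, an index increment per call, and counter updates inside ReplayBirth) collapses into one tight loop plus a single bulk update of the totals, which RecordBirth now accumulates once at record time. A minimal self-contained sketch of that bookkeeping pattern, using hypothetical Trace and Counters types rather than the real fleetbench classes:

#include <cstddef>
#include <vector>

// Sketch only: instead of bumping global counters on every replayed
// allocation, the totals for one pass over the trace are accumulated at
// record time and applied in bulk after each replay.
struct Trace {
  std::vector<bool> is_birth;    // operation kind, fixed at record time
  std::vector<size_t> sizes;     // size-class index per operation
  size_t allocs_per_pass = 0;    // accumulated while recording births
  size_t bytes_per_pass = 0;     // accumulated while recording births
};

struct Counters {
  size_t total_num_allocated = 0;
  size_t total_bytes_allocated = 0;
};

// One full pass over the trace; the hot loop performs no counter updates,
// and the totals are applied exactly once per pass.
void ReplayPass(const Trace& t, Counters& c) {
  for (size_t i = 0; i < t.is_birth.size(); ++i) {
    if (t.is_birth[i]) {
      // ... allocate an object of size class t.sizes[i] and touch it ...
    } else {
      // ... free the next recorded death object ...
    }
  }
  c.total_num_allocated += t.allocs_per_pass;
  c.total_bytes_allocated += t.bytes_per_pass;
}

The replay hot loop then only touches the trace arrays; the statistics counters are entirely off the per-operation path.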

fleetbench/tcmalloc/empirical.h

Lines changed: 12 additions & 15 deletions
@@ -203,10 +203,11 @@ class EmpiricalData {
   // logic of this function please see the comments within Next().
   void RecordNext();
 
-  // Replays the next alloc or dealloc we recorded when building the trace.
-  // Also updates the indices into the recorded birth / death trace.
-  // incremented.
-  void ReplayNext();
+  // Replays the recorded trace.
+  void ReplayTrace();
+
+  // Returns the number of allocs and deallocs in the recorded trace.
+  size_t TraceLength() const { return birth_or_death_.size(); }
 
   std::default_random_engine* const rng() { return &rng_; }
 
@@ -222,23 +223,17 @@ class EmpiricalData {
   // starting to replay the trace.
   void RestoreSnapshot();
 
-  // Restores the *lengths* of the number of live objects within each size class
-  // to what it was after the warmup allocations were complete. This is
-  // accomplished by either allocating or deallocating objects until the same
-  // number of objects are live within each size class as were live after the
-  // warmup allocations were complete. This is safe to call repeatedly.
-  void RepairToSnapshotState();
+  // Records a sequence of allocations and deallocations that restores the
+  // *lengths* of the number of live objects within each size class to what it
+  // was after the warmup allocations were complete. This function should only
+  // be called once (at the end of the recording phase).
+  void RecordRepairToSnapshotState();
 
   // Computes addresses to prefetch when executing in record and replay mode.
   // This is necessary to minimize the impact of indexing into SizeState.objs
   // when freeing an object.
   void BuildDeathObjectPointers();
 
-  // Tests whether we have reached the end of the birth / death trace. If so
-  // performs the actions necessary so that we can start replaying allocs /
-  // deallocs from the beginning of the trace again.
-  void RestartTraceIfNecessary();
-
  private:
   std::default_random_engine rng_;
 
@@ -276,6 +271,8 @@ class EmpiricalData {
   std::vector<void**> death_object_pointers_;
   uint32_t birth_or_death_index_ = 0;
   uint32_t death_object_index_ = 0;
+  size_t num_allocated_recorded_;
+  size_t bytes_allocated_recorded_;
 };
 
 std::vector<EmpiricalData::Entry> GetEmpiricalDataEntries(
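
Taken together, the new declarations imply a stricter lifecycle than before: record a trace, append the repair operations exactly once, restore the heap snapshot, build the prefetch pointers, then replay whole traces. A stub sketch of that call order (RecorderSketch is an illustrative stand-in, not the real EmpiricalData):

#include <cstddef>

// Illustrative no-op stand-in for EmpiricalData, showing only the call order
// this header now implies.
class RecorderSketch {
 public:
  void RecordNext() { ++trace_length_; }
  void RecordRepairToSnapshotState() {}  // once, at the end of recording
  void RestoreSnapshot() {}
  void BuildDeathObjectPointers() {}
  void ReplayTrace() {}
  size_t TraceLength() const { return trace_length_; }

 private:
  size_t trace_length_ = 0;
};

int main() {
  RecorderSketch load;
  constexpr size_t kTarget = 1'000'000;  // cf. kRecordAndReplayBufferSizeTarget
  for (size_t i = 0; i < kTarget; ++i) load.RecordNext();
  load.RecordRepairToSnapshotState();  // may append a few extra operations
  load.RestoreSnapshot();
  load.BuildDeathObjectPointers();
  for (int rep = 0; rep < 3; ++rep) load.ReplayTrace();  // whole trace per call
  return load.TraceLength() > 0 ? 0 : 1;
}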

fleetbench/tcmalloc/empirical_driver.cc

Lines changed: 17 additions & 14 deletions
@@ -44,17 +44,21 @@ namespace {
 
 void* alloc(size_t s) { return ::operator new(s); }
 
-static constexpr int64_t kBatch = 100;
 // When non-zero, empirical driver will simulate tick of ReleaseMemoryToOS
 // iteration, given number of bytes allocated.
 static constexpr int64_t kSimulatedBytesPerSec = 0;
 // The total number of allocs / deallocs to precalculate for later replay.
 // Memory required to store replay buffers scales with the number of threads.
-static constexpr size_t kRecordAndReplayBufferSize = 1'000'000;
+// The actual number of allocs / deallocs can be slightly larger than this
+// value, as at the end of the replay, additional operations are performed to
+// bring the number of live objects back to where it was at the start of the
+// replay.
+static constexpr size_t kRecordAndReplayBufferSizeTarget = 1'000'000;
 // Number of bytes to try to release from the page heap per second.
 static constexpr int64_t kEmpiricalMallocReleaseBytesPerSec = 0;
-// Number of iterations to warm up the benchmark before the main benchmark loop.
-static constexpr size_t kNumWarmUpIterations = 500000;
+// Number of replays of the entire trace to warm up the benchmark before the
+// main benchmark loop.
+static constexpr size_t kNumWarmUpReplays = 50;
 
 class SimThread {
  public:
@@ -80,11 +84,10 @@ class SimThread {
   }
 
   void RecordBirthsAndDeaths(EmpiricalData* load) {
-    // Round number of births / deaths to record down to a multiple of kBatch.
-    const int buffer_size = (kRecordAndReplayBufferSize / kBatch) * kBatch;
-    for (int i = 0; i < buffer_size; ++i) {
+    for (int i = 0; i < kRecordAndReplayBufferSizeTarget; ++i) {
       load->RecordNext();
     }
+    load->RecordRepairToSnapshotState();
 
     load->RestoreSnapshot();
     load->BuildDeathObjectPointers();
@@ -106,10 +109,8 @@ class SimThread {
 
   void ReplayTrace() {
     DCHECK(done_recording_);
-    for (int i = 0; i < kBatch; i++) {
-      load_.ReplayNext();
-    }
-    load_.RestartTraceIfNecessary();
+    load_.ReplayTrace();
+
     auto allocated = load_.total_bytes_allocated();
     load_bytes_allocated_.store(allocated, std::memory_order_relaxed);
     auto total_num_allocated = load_.total_num_allocated();
@@ -122,6 +123,8 @@ class SimThread {
     }
   }
 
+  size_t TraceLength() { return load_.TraceLength(); }
+
  private:
   size_t n_;
   size_t transient_;
@@ -217,14 +220,14 @@ static void BM_TCMalloc_Empirical_Driver(benchmark::State& state) {
   // We do not use the MinWarmUpTime feature of the benchmark framework here,
   // as that feature calls Teardown and Setup after the warm-up phase, which
   // resets the state that we want to establish with the warm-up.
-  for (int i = 0; i < kNumWarmUpIterations; i++) {
+  for (int i = 0; i < kNumWarmUpReplays; i++) {
     sim_threads[thread_idx]->ReplayTrace();
   }
 
   size_t bytes_warm_up = sim_threads[thread_idx]->total_bytes_allocated();
   size_t allocations_warm_up = sim_threads[thread_idx]->load_allocations();
 
-  for (auto _ : state) {
+  while (state.KeepRunningBatch(sim_threads[thread_idx]->TraceLength())) {
     sim_threads[thread_idx]->ReplayTrace();
   }
 
@@ -315,7 +318,7 @@ void RegisterBenchmarks() {
         ->Teardown(BM_TCMalloc_Empirical_Driver_Teardown)
         ->ThreadRange(1, 1)
         ->UseRealTime()
-        ->Iterations(100000);
+        ->Iterations(10'000'000);
     min_threads = 2;
   }
 

fleetbench/tcmalloc/empirical_test.cc

Lines changed: 3 additions & 10 deletions
@@ -56,15 +56,14 @@ TEST(EmpiricalRecordAndReplay, Basic) {
     for (int j = 0; j < kBufferSize; ++j) {
       data.RecordNext();
     }
+    data.RecordRepairToSnapshotState();
 
     data.RestoreSnapshot();
     data.BuildDeathObjectPointers();
 
     // We need one warmup iteration so we can compute the delta allocations and
     // bytes we should see from each time through the trace.
-    for (int j = 0; j < kBufferSize; ++j) {
-      data.ReplayNext();
-    }
+    data.ReplayTrace();
 
     size_t delta_allocations = data.total_num_allocated() - total_allocations;
     size_t delta_bytes_allocated =
@@ -76,16 +75,10 @@ TEST(EmpiricalRecordAndReplay, Basic) {
     EXPECT_EQ(delta_bytes_allocated,
               data.total_bytes_allocated() - total_bytes_allocated);
 
-    // Restart the trace before updating total_* so we don't capture the
-    // "repair" operations.
-    data.RestartTraceIfNecessary();
-
     total_allocations = data.total_num_allocated();
     total_bytes_allocated = data.total_bytes_allocated();
 
-    for (int j = 0; j < kBufferSize; ++j) {
-      data.ReplayNext();
-    }
+    data.ReplayTrace();
   }
 }
 
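
The test no longer needs RestartTraceIfNecessary() because the repair operations are baked into the trace itself by RecordRepairToSnapshotState(). A self-contained sketch of that repair idea, with illustrative types rather than fleetbench code:

#include <cstddef>
#include <utility>
#include <vector>

// Sketch only: append just enough birth and death operations per size class
// that a full replay of the trace leaves the live-object counts exactly
// where they started, so the trace can be replayed back to back without any
// per-replay repair step.
struct SizeClassState {
  size_t live = 0;      // live objects at the end of recording
  size_t snapshot = 0;  // live objects when the snapshot was taken
};

// Returns the extra (is_birth, size_class_index) operations to append.
std::vector<std::pair<bool, size_t>> RepairOps(
    std::vector<SizeClassState> state) {
  std::vector<std::pair<bool, size_t>> ops;
  for (size_t i = 0; i < state.size(); ++i) {
    while (state[i].live < state[i].snapshot) {  // too few live: add births
      ops.emplace_back(true, i);
      ++state[i].live;
    }
    while (state[i].live > state[i].snapshot) {  // too many live: add deaths
      ops.emplace_back(false, i);
      --state[i].live;
    }
  }
  return ops;
}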
