Skip to content

Commit 6f530dd

Browse files
andreas-abel authored and copybara-github committed
Improve TCMalloc benchmark throughput
PiperOrigin-RevId: 786749030 Change-Id: I5de1fc501bdaa7ffb1d449c048fd2f347ad6745a
1 parent e901d0d commit 6f530dd

File tree

5 files changed

+178
-84
lines changed

5 files changed

+178
-84
lines changed

fleetbench/tcmalloc/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ cc_library(
1111
deps = [
1212
"//fleetbench/common",
1313
"@com_google_absl//absl/algorithm:container",
14+
"@com_google_absl//absl/base:core_headers",
1415
"@com_google_absl//absl/container:flat_hash_map",
1516
"@com_google_absl//absl/functional:function_ref",
1617
"@com_google_absl//absl/log:check",

fleetbench/tcmalloc/empirical.cc

Lines changed: 80 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -133,13 +133,18 @@ EmpiricalData::EmpiricalData(size_t seed, const absl::Span<const Entry> weights,
133133
}
134134

135135
SnapshotLiveObjects();
136+
137+
for (auto& s : state_) {
138+
for (size_t i = 0; i < s.objs.size(); ++i) {
139+
s.objs_indices_after_recording.push_back({.born = false, .index = i});
140+
}
141+
}
136142
}
137143

138144
EmpiricalData::~EmpiricalData() {
139145
for (auto& s : state_) {
140-
const size_t size = s.size;
141146
for (auto p : s.objs) {
142-
dealloc_(p, size);
147+
dealloc_(p, s.size);
143148
}
144149
}
145150
}
@@ -173,6 +178,7 @@ void EmpiricalData::RecordBirth(const size_t i) {
173178
birth_or_death_.push_back(true);
174179
birth_or_death_sizes_.push_back(i);
175180
SizeState& s = state_[i];
181+
birth_or_death_actual_sizes_.push_back(s.size);
176182
death_sampler_.AdjustWeight(i, s.death_rate);
177183
num_allocated_recorded_++;
178184
bytes_allocated_recorded_ += s.size;
@@ -181,13 +187,14 @@ void EmpiricalData::RecordBirth(const size_t i) {
181187
// when building the trace we can just push nullptr to keep the length of live
182188
// object lists consistent with what it should have been after a true birth.
183189
s.objs.push_back(nullptr);
190+
s.objs_indices_after_recording.push_back(
191+
{.born = true, .index = birth_object_index_});
192+
++birth_object_index_;
184193
}
185194

186-
void* EmpiricalData::ReplayBirth(const size_t i) {
187-
SizeState& s = state_[i];
188-
const size_t size = s.size;
195+
void* EmpiricalData::ReplayBirth(const size_t size) {
189196
void* p = alloc_(size);
190-
s.objs.push_back(p);
197+
*birth_pointers_[birth_object_index_] = p;
191198
return p;
192199
}
193200

@@ -196,21 +203,22 @@ void EmpiricalData::RecordDeath(const size_t i) {
196203
SizeState& s = state_[i];
197204
CHECK(!s.objs.empty());
198205
birth_or_death_sizes_.push_back(i);
206+
birth_or_death_actual_sizes_.push_back(s.size);
199207
auto to_free = absl::uniform_int_distribution<int>(
200208
0, std::max(0, static_cast<int>(s.objs.size()) - 1))(rng_);
201209
death_sampler_.AdjustWeight(i, -s.death_rate);
202210
s.objs[to_free] = s.objs.back();
203211
s.objs.pop_back();
204-
death_objects_.push_back(to_free);
212+
213+
death_obj_indices_.push_back(s.objs_indices_after_recording[to_free]);
214+
s.objs_indices_after_recording[to_free] =
215+
s.objs_indices_after_recording.back();
216+
s.objs_indices_after_recording.pop_back();
205217
}
206218

207-
void EmpiricalData::ReplayDeath(const size_t i, uint64_t index) {
208-
SizeState& s = state_[i];
209-
CHECK(!s.objs.empty());
210-
void* p = s.objs[index];
211-
s.objs[index] = s.objs.back();
212-
s.objs.pop_back();
213-
dealloc_(p, s.size);
219+
void EmpiricalData::ReplayDeath(const size_t size) {
220+
void* p = death_objects_[death_object_index_];
221+
dealloc_(p, size);
214222
}
215223

216224
void EmpiricalData::RecordNext() {
@@ -230,29 +238,43 @@ void EmpiricalData::RecordNext() {
230238
}
231239

232240
void EmpiricalData::ReplayTrace() {
233-
for (birth_or_death_index_ = 0, death_object_index_ = 0;
234-
birth_or_death_index_ < birth_or_death_.size();
235-
++birth_or_death_index_) {
236-
bool do_birth = birth_or_death_[birth_or_death_index_];
241+
for (birth_object_index_ = 0, death_object_index_ = 0;
242+
birth_object_index_ + death_object_index_ < birth_or_death_.size();) {
243+
size_t birth_or_death_index = birth_object_index_ + death_object_index_;
244+
bool do_birth = birth_or_death_[birth_or_death_index];
245+
size_t size = birth_or_death_actual_sizes_[birth_or_death_index];
246+
237247
if (do_birth) {
238-
void* allocated =
239-
ReplayBirth(birth_or_death_sizes_[birth_or_death_index_]);
248+
void* allocated = ReplayBirth(size);
240249
TouchAllocated(allocated);
250+
++birth_object_index_;
241251
} else {
242-
ReplayDeath(birth_or_death_sizes_[birth_or_death_index_],
243-
death_objects_[death_object_index_]);
244-
__builtin_prefetch(death_object_pointers_[death_object_index_], /*rw=*/1,
245-
/*locality*/ 3);
252+
ReplayDeath(size);
246253
++death_object_index_;
247254
}
248255
}
249256
total_num_allocated_ += num_allocated_recorded_;
250257
total_bytes_allocated_ += bytes_allocated_recorded_;
251258
}
252259

260+
void EmpiricalData::PrepareNextReplay() {
261+
for (auto& s : state_) {
262+
for (const auto& update : s.obj_update) {
263+
s.objs[update.to] = s.objs[update.from];
264+
}
265+
for (const auto& update : s.birth_update) {
266+
s.objs[update.to] = birth_objects_[update.from];
267+
}
268+
for (const auto& update : s.death_update) {
269+
death_objects_[update.to] = s.objs[update.from];
270+
}
271+
}
272+
}
273+
253274
void EmpiricalData::SnapshotLiveObjects() {
254275
for (const auto& s : state_) {
255-
snapshot_state_.push_back({s.size, s.death_rate, s.objs});
276+
snapshot_state_.push_back(
277+
{s.size, s.death_rate, s.objs, s.objs_indices_after_recording});
256278
}
257279
}
258280

@@ -262,53 +284,46 @@ void EmpiricalData::RestoreSnapshot() {
262284
}
263285
}
264286

265-
void EmpiricalData::ReserveSizeClassObjects() {
266-
// Keep a running sum and high water mark for the delta in the size class
267-
// object arrays.
268-
std::vector<int32_t> max_object_size_delta(state_.size(), 0);
269-
std::vector<int32_t> cur_object_size_delta(state_.size(), 0);
270-
for (int i = 0; i < birth_or_death_.size(); i++) {
271-
auto size_class = birth_or_death_sizes_[i];
287+
void EmpiricalData::BuildUpdateVectors() {
288+
birth_pointers_.resize(birth_object_index_);
289+
birth_objects_.resize(birth_object_index_);
290+
death_objects_.resize(death_obj_indices_.size());
291+
292+
for (size_t i = 0, death_index = 0; i < birth_or_death_.size(); ++i) {
293+
// Skip births
272294
if (birth_or_death_[i]) {
273-
cur_object_size_delta[size_class]++;
274-
max_object_size_delta[size_class] = std::max(
275-
max_object_size_delta[size_class], cur_object_size_delta[size_class]);
276-
} else {
277-
cur_object_size_delta[size_class]--;
295+
continue;
278296
}
279-
}
280297

281-
for (int i = 0; i < state_.size(); i++) {
282-
state_[i].objs.reserve(state_[i].objs.size() + max_object_size_delta[i]);
298+
const ObjectIndex& death_obj_index = death_obj_indices_[death_index];
299+
if (!death_obj_index.born) {
300+
SizeState& s = state_[birth_or_death_sizes_[i]];
301+
size_t from = death_obj_index.index;
302+
size_t to = death_index;
303+
s.death_update.push_back({.from = from, .to = to});
304+
death_objects_[to] = s.objs[from];
305+
} else {
306+
size_t birth_index = death_obj_index.index;
307+
birth_pointers_[birth_index] = &death_objects_[death_index];
308+
}
309+
++death_index;
283310
}
284-
}
285311

286-
void EmpiricalData::BuildDeathObjectPointers() {
287-
constexpr uint32_t kPrefetchDistance = 64;
288-
289-
// This is a bit ugly but because the below code can create pointers past the
290-
// end of the current objects arrays we need to first need to reserve their
291-
// capacity at the maximum capacity they will ever hit to ensure they won't
292-
// grow and possibly be reallocated. They will never grow beyond the size
293-
// calculated by this function.
294-
ReserveSizeClassObjects();
295-
296-
// The easiest way to compute the prefetch objects is to get the pointers
297-
// corresponding to each death_objects_[] and then rotating the array so the
298-
// N + prefetch_distance object is stored at index N.
299-
uint32_t death_index = 0;
300-
for (int i = 0; i < birth_or_death_.size(); i++) {
301-
// Skip births
302-
if (birth_or_death_[i]) {
303-
continue;
312+
for (SizeState& s : state_) {
313+
for (size_t i = 0; i < s.objs_indices_after_recording.size(); ++i) {
314+
if (!s.objs_indices_after_recording[i].born) {
315+
size_t from = s.objs_indices_after_recording[i].index;
316+
size_t to = i;
317+
if (from != to) {
318+
s.obj_update.push_back({.from = from, .to = to});
319+
}
320+
} else {
321+
size_t birth_index = s.objs_indices_after_recording[i].index;
322+
birth_pointers_[birth_index] = &birth_objects_[birth_index];
323+
s.birth_update.push_back({.from = birth_index, .to = i});
324+
}
304325
}
305-
SizeState& s = state_[birth_or_death_sizes_[i]];
306-
death_object_pointers_.push_back(s.objs.data() +
307-
death_objects_[death_index++]);
308326
}
309-
std::rotate(death_object_pointers_.begin(),
310-
death_object_pointers_.begin() + kPrefetchDistance,
311-
death_object_pointers_.end());
312327
}
313328

314329
void EmpiricalData::RecordRepairToSnapshotState() {

fleetbench/tcmalloc/empirical.h

Lines changed: 84 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,11 @@
4040

4141
#include <cstdint>
4242
#include <random>
43+
#include <string>
4344
#include <vector>
4445

4546
#include "absl/algorithm/container.h"
47+
#include "absl/base/attributes.h"
4648
#include "absl/container/flat_hash_map.h"
4749
#include "absl/functional/function_ref.h"
4850
#include "absl/log/check.h"
@@ -229,28 +231,81 @@ class EmpiricalData {
229231
// be called once (at the end of the recording phase).
230232
void RecordRepairToSnapshotState();
231233

232-
// Computes addresses to prefetch when executing in record and replay mode.
233-
// This is necessary to minimize the impact of indexing into SizeState.objs
234-
// when freeing an object.
235-
void BuildDeathObjectPointers();
234+
// Fills the update vectors in `SizeState` that are used to update the state
235+
// after each replay of the recorded trace.
236+
// This function should be called once after RestoreSnapshot().
237+
void BuildUpdateVectors();
238+
239+
// Prepares the next replay round by updating the state based on the update
240+
// vectors in `SizeState`.
241+
// Must be called after each replay, but not before the first replay.
242+
void PrepareNextReplay();
236243

237244
private:
238245
std::default_random_engine rng_;
239246

247+
struct ObjectIndex {
248+
// Whether the object was born during the recording phase.
249+
bool born : 1;
250+
size_t index : 63;
251+
};
252+
253+
struct UpdatePair {
254+
size_t from;
255+
size_t to;
256+
};
257+
240258
struct SizeState {
241259
const size_t size;
242260
const double death_rate;
243261
std::vector<void*> objs;
262+
263+
// objs_indices_after_recording[i] = j has the following meaning:
264+
// - if j.born == 0, then after the recording phase, objs[i] contains the
265+
// object that was stored in objs[j.index] before the recording phase.
266+
// - if j.born == 1, then objs[i] contains the (j.index)-th object that
267+
// was born during the recording phase.
268+
std::vector<ObjectIndex> objs_indices_after_recording;
269+
270+
// Updating the objs vector and the death_objects_ vector are relatively
271+
// expensive operations. Therefore, we try to avoid performing these updates
272+
// after each allocation/deallocation during the main benchmarking loop;
273+
// instead, we perform all of the updates (except for the updates to
274+
// death_objects_ for objects that are born and killed in the same replay
275+
// round) together after each replay of the entire trace in a
276+
// `Pause/ResumeTiming` block so that they don't affect the benchmark
277+
// results. To perform these updates, we use the following vectors.
278+
279+
// obj_update contains pairs of the form (from, to) that are used to update
280+
// `objs` after each replay of the recorded trace. After the replay,
281+
// objs[to] is updated to store object that was in objs[from] before the
282+
// replay. The elements of the vector are ordered such that no `to` value
283+
// is equal to a `from` value that occurs later in the vector. Thus, the
284+
// update can be performed in place, and it is not necessary to create a
285+
// copy of `objs`. Pairs where `from` is equal to `to` are omitted for
286+
// efficiency reasons.
287+
std::vector<UpdatePair> obj_update;
288+
289+
// birth_update contains pairs of the form (from, to). After each replay,
290+
// objs[to] is updated to store the element that was written to
291+
// birth_objects_[from] during the replay.
292+
// These updates are performed after the obj_update updates.
293+
std::vector<UpdatePair> birth_update;
294+
295+
// death_update contains pairs of the form (from, to). After each replay,
296+
// death_objects_[to] is updated to store the element that is currently
297+
// stored in objs[from].
298+
// These updates are performed after the birth_update updates.
299+
std::vector<UpdatePair> death_update;
244300
};
245301

246-
void* DoBirth(const size_t i);
247-
void DoDeath(const size_t i);
302+
void* DoBirth(size_t i);
303+
void DoDeath(size_t i);
248304

249-
void RecordBirth(const size_t i);
250-
void* ReplayBirth(const size_t i);
251-
void RecordDeath(const size_t i);
252-
void ReplayDeath(const size_t i, const uint64_t index);
253-
void ReserveSizeClassObjects();
305+
void RecordBirth(size_t i);
306+
void* ReplayBirth(size_t size);
307+
void RecordDeath(size_t i);
308+
void ReplayDeath(size_t size);
254309

255310
absl::FunctionRef<void*(size_t)> alloc_;
256311
absl::FunctionRef<void(void*, size_t)> dealloc_;
@@ -267,12 +322,27 @@ class EmpiricalData {
267322
std::vector<SizeState> snapshot_state_;
268323
std::vector<bool> birth_or_death_;
269324
std::vector<uint16_t> birth_or_death_sizes_;
270-
std::vector<uint32_t> death_objects_;
271-
std::vector<void**> death_object_pointers_;
272-
uint32_t birth_or_death_index_ = 0;
325+
std::vector<size_t> birth_or_death_actual_sizes_;
326+
std::vector<void*> death_objects_;
327+
std::vector<void*> birth_objects_;
273328
uint32_t death_object_index_ = 0;
329+
uint32_t birth_object_index_ = 0;
274330
size_t num_allocated_recorded_;
275331
size_t bytes_allocated_recorded_;
332+
333+
// birth_pointers_[i] contains the address where the i-th object that is born
334+
// during the replay should be stored. This address either points to an entry
335+
// in death_objects_ if the object is killed during the same replay round, or
336+
// to birth_objects_[i] otherwise.
337+
std::vector<void**> birth_pointers_;
338+
339+
// death_obj_indices_[i] = j has the following meaning:
340+
// - if j.born == 0, then the i-th recorded death operation kills the object
341+
// that was stored in objs[j.index] in the corresponding `SizeState`
342+
// before the recording phase.
343+
// - if j.born == 1, then the i-th recorded death operation kills the
344+
// (j.index)-th object that was born during the recording phase.
345+
std::vector<ObjectIndex> death_obj_indices_;
276346
};
277347

278348
std::vector<EmpiricalData::Entry> GetEmpiricalDataEntries(

0 commit comments

Comments
 (0)