Skip to content

Commit b43983c

Browse files
apronchenkov and copybara-github
authored and committed
Refactor Koladata pseudo-random generator to use a weak symbol for reproducibility
This patch replaces the mechanism used to de-randomize Koladata. Instead of relying on an environment variable (`KOLADATA_DETERMINISTIC_SEED`), it now uses a weak symbol, `KoladataInternalPseudoRandomUint64`. This allows the behaviour to be controlled by adding a build dependency or by using LD_PRELOAD. The primary motivation is to remove "special" mode logic from client applications. Specifically, an overriding implementation can enforce single-thread access to the random number generator without imposing this logic on the default implementation.

PiperOrigin-RevId: 843637091
Change-Id: Ia2b8fb2348b0529138dadb8ce1f3d7cb41138c42
1 parent 5b4aa1a commit b43983c

File tree

12 files changed

+96
-171
lines changed

12 files changed

+96
-171
lines changed

koladata/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ cc_library(
8888
"//koladata/internal:data_bag",
8989
"//koladata/internal:data_item",
9090
"//koladata/internal:object_id",
91-
"//koladata/internal:random",
91+
"//koladata/internal:pseudo_random",
9292
"//koladata/internal:schema_attrs",
9393
"//koladata/internal:triples",
9494
"@com_google_absl//absl/base:core_headers",

koladata/data_bag.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
#include "arolla/util/repr.h"
3939
#include "koladata/internal/data_bag.h"
4040
#include "koladata/internal/object_id.h"
41-
#include "koladata/internal/random.h"
41+
#include "koladata/internal/pseudo_random.h"
4242

4343
namespace koladata {
4444

@@ -225,7 +225,7 @@ class DataBag : public arolla::RefcountedBase {
225225
has_mutable_fallbacks_(false),
226226
// NOTE: consider lazy initialization of the fingerprint if it becomes
227227
// expensive to compute.
228-
fingerprint_(internal::MaybeDeterministicRandomFingerprint()) {}
228+
fingerprint_(internal::PseudoRandomFingerprint()) {}
229229

230230
// Returns a mutable DataBag that wraps provided low-level DataBagImpl.
231231
static DataBagPtr FromImpl(internal::DataBagImplPtr impl);

koladata/expr/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ cc_library(
129129
hdrs = ["non_determinism.h"],
130130
deps = [
131131
":init",
132-
"//koladata/internal:random",
132+
"//koladata/internal:pseudo_random",
133133
"@com_google_absl//absl/base:no_destructor",
134134
"@com_google_absl//absl/status:statusor",
135135
"@com_google_absl//absl/strings:string_view",

koladata/expr/non_determinism.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
#include "arolla/expr/expr.h"
2323
#include "arolla/expr/expr_node.h"
2424
#include "arolla/expr/registered_expr_operator.h"
25-
#include "koladata/internal/random.h"
25+
#include "koladata/internal/pseudo_random.h"
2626

2727
namespace koladata::expr {
2828

@@ -35,9 +35,9 @@ absl::StatusOr<arolla::expr::ExprNodePtr> GenNonDeterministicToken() {
3535
static const absl::NoDestructor op(
3636
std::make_shared<RegisteredOperator>("koda_internal.non_deterministic"));
3737
static const absl::NoDestructor leaf(Leaf(kNonDeterministicTokenLeafKey));
38-
return MakeOpNode(*op,
39-
{*leaf, Literal(static_cast<int64_t>(
40-
internal::MaybeDeterministicRandomUint64()))});
38+
return MakeOpNode(
39+
*op,
40+
{*leaf, Literal(static_cast<int64_t>(internal::PseudoRandomUint64()))});
4141
}
4242

4343
} // namespace koladata::expr

koladata/internal/BUILD

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ cc_library(
6565
srcs = ["object_id.cc"],
6666
hdrs = ["object_id.h"],
6767
deps = [
68-
":random",
68+
":pseudo_random",
6969
"//koladata/internal/op_utils:base62",
7070
"@com_google_absl//absl/base:core_headers",
7171
"@com_google_absl//absl/container:flat_hash_set",
@@ -1089,26 +1089,22 @@ cc_library(
10891089
)
10901090

10911091
cc_library(
1092-
name = "random",
1093-
srcs = ["random.cc"],
1094-
hdrs = ["random.h"],
1092+
name = "pseudo_random",
1093+
srcs = ["pseudo_random.cc"],
1094+
hdrs = ["pseudo_random.h"],
10951095
deps = [
10961096
"@com_google_absl//absl/base:core_headers",
1097-
"@com_google_absl//absl/base:no_destructor",
10981097
"@com_google_absl//absl/numeric:int128",
10991098
"@com_google_absl//absl/random",
1100-
"@com_google_absl//absl/random:distributions",
1101-
"@com_google_absl//absl/strings:string_view",
1102-
"@com_google_absl//absl/synchronization",
11031099
"@com_google_arolla//arolla/util",
11041100
],
11051101
)
11061102

11071103
cc_test(
1108-
name = "random_test",
1109-
srcs = ["random_test.cc"],
1104+
name = "pseudo_random_test",
1105+
srcs = ["pseudo_random_test.cc"],
11101106
deps = [
1111-
":random",
1107+
":pseudo_random",
11121108
"@com_google_googletest//:gtest_main",
11131109
],
11141110
)

koladata/internal/object_id.cc

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,15 @@
3131
#include "absl/strings/str_cat.h"
3232
#include "absl/strings/str_join.h"
3333
#include "arolla/qtype/simple_qtype.h"
34-
#include "koladata/internal/random.h"
34+
#include "koladata/internal/pseudo_random.h"
3535

3636
namespace koladata::internal {
3737

3838
namespace {
3939

4040
uint64_t AllocatorId() {
4141
static uint64_t kAllocatorId =
42-
MaybeDeterministicRandomUint64() & ((1ull << 52) - 1);
42+
PseudoRandomUint64() & ((1ull << 52) - 1);
4343
return kAllocatorId;
4444
}
4545

@@ -105,9 +105,6 @@ AllocationId Allocate(size_t size) {
105105
id << id_bit_count);
106106
};
107107

108-
// TODO: b/464002636 — If we decide to force single-threaded usage, when
109-
// KOLADATA_DETERMINISTIC_SEED variable is set, we have to add an assertion
110-
// here as well.
111108
thread_local std::array<std::pair<uint64_t, uint64_t>, kMaxOffsetBits + 1>
112109
thread_id_per_offset_;
113110

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,29 +12,28 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414
//
15-
#ifndef KOLADATA_INTERNAL_RANDOM_H_
16-
#define KOLADATA_INTERNAL_RANDOM_H_
15+
#include "koladata/internal/pseudo_random.h"
1716

1817
#include <cstdint>
1918

19+
#include "absl/base/attributes.h"
20+
#include "absl/numeric/int128.h"
21+
#include "absl/random/random.h"
2022
#include "arolla/util/fingerprint.h"
2123

24+
// Note: Implement the function as a weak symbol so that this implementation can
25+
// be overridden by a build dependency or via LD_PRELOAD.
26+
extern "C" ABSL_ATTRIBUTE_WEAK uint64_t KoladataInternalPseudoRandomUint64() {
27+
static thread_local absl::BitGen bitgen;
28+
return absl::Uniform<uint64_t>(bitgen);
29+
}
30+
2231
namespace koladata::internal {
2332

24-
// Returns a random uniformly distributed uint64.
25-
//
26-
// If KOLADATA_DETERMINISTIC_SEED environment variable is set (the changes of
27-
// the value in runtime are ignored), the random number generator will be
28-
// deterministic.
29-
uint64_t MaybeDeterministicRandomUint64();
30-
31-
// Returns a random uniformly distributed fingerprint.
32-
//
33-
// If KOLADATA_DETERMINISTIC_SEED environment variable is set (the changes of
34-
// the value in runtime are ignored), the random number generator will be
35-
// deterministic.
36-
arolla::Fingerprint MaybeDeterministicRandomFingerprint();
33+
arolla::Fingerprint PseudoRandomFingerprint() {
34+
return arolla::Fingerprint(
35+
absl::MakeUint128(KoladataInternalPseudoRandomUint64(),
36+
KoladataInternalPseudoRandomUint64()));
37+
}
3738

3839
} // namespace koladata::internal
39-
40-
#endif // KOLADATA_INTERNAL_RANDOM_H_

koladata/internal/pseudo_random.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
//
15+
#ifndef KOLADATA_INTERNAL_PSEUDO_RANDOM_H_
16+
#define KOLADATA_INTERNAL_PSEUDO_RANDOM_H_
17+
18+
#include <cstdint>
19+
20+
#include "arolla/util/fingerprint.h"
21+
22+
// Returns a uniformly distributed pseudo-random uint64.
23+
//
24+
// While not cryptographically secure, the default implementation generates
25+
// unique sequences of random numbers for each process.
26+
//
27+
// Note: This function supports overriding via LD_PRELOAD. It is declared as "C"
28+
// function because providing multiple definitions in C++ constitutes an ODR
29+
// violation.
30+
//
31+
// WARNING: Koladata relies on high-entropy randomness to prevent collisions
32+
// across processes. If you override this function to return a fixed sequence,
33+
// you must ensure the process is isolated; it must not interact with any
34+
// other process using the same sequence.
35+
extern "C" uint64_t KoladataInternalPseudoRandomUint64();
36+
37+
namespace koladata::internal {
38+
39+
// Returns a pseudo-random uniformly distributed uint64.
40+
inline uint64_t PseudoRandomUint64() {
41+
return KoladataInternalPseudoRandomUint64();
42+
}
43+
44+
// Returns a pseudo-random uniformly distributed fingerprint.
45+
arolla::Fingerprint PseudoRandomFingerprint();
46+
47+
} // namespace koladata::internal
48+
49+
#endif // KOLADATA_INTERNAL_PSEUDO_RANDOM_H_
Lines changed: 7 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414
//
15-
#include "koladata/internal/random.h"
15+
#include "koladata/internal/pseudo_random.h"
1616

1717
#include <cstdint>
1818
#include <thread> // NOLINT(build/c++11): only used for tests.
@@ -21,48 +21,17 @@
2121
#include "gtest/gtest.h"
2222

2323
namespace koladata::internal {
24-
25-
uint64_t DeterministicRandomUint64();
26-
2724
namespace {
2825

29-
TEST(RandomTest, MaybeDeterministicRandomUint64) {
30-
EXPECT_NE(MaybeDeterministicRandomUint64(), MaybeDeterministicRandomUint64());
31-
}
32-
33-
TEST(RandomTest, MaybeDeterministicRandomFingerprint) {
34-
EXPECT_NE(MaybeDeterministicRandomFingerprint(),
35-
MaybeDeterministicRandomFingerprint());
26+
TEST(PseudoRandomTest, PseudoRandomUint64) {
27+
EXPECT_NE(PseudoRandomUint64(), PseudoRandomUint64());
3628
}
3729

38-
TEST(RandomTest, MaybeDeterministicRandomUint64_MultiThreaded) {
39-
constexpr int kNumThreads = 10;
40-
constexpr int kNumValues = 10;
41-
std::vector<std::vector<uint64_t>> results(kNumThreads);
42-
43-
std::vector<std::thread> threads;
44-
threads.reserve(kNumThreads);
45-
for (int i = 0; i < kNumThreads; ++i) {
46-
threads.emplace_back([&results, i] {
47-
results[i].reserve(kNumValues);
48-
for (int j = 0; j < kNumValues; ++j) {
49-
results[i].push_back(MaybeDeterministicRandomUint64());
50-
}
51-
});
52-
}
53-
54-
for (auto& thread : threads) {
55-
thread.join();
56-
}
57-
58-
for (int i = 0; i < kNumThreads; ++i) {
59-
for (int j = i + 1; j < kNumThreads; ++j) {
60-
EXPECT_NE(results[i], results[j]);
61-
}
62-
}
30+
TEST(PseudoRandomTest, PseudoRandomFingerprint) {
31+
EXPECT_NE(PseudoRandomFingerprint(), PseudoRandomFingerprint());
6332
}
6433

65-
TEST(RandomTest, DeterministicRandomUint64_MultiThreaded) {
34+
TEST(PseudoRandomTest, PseudoRandomUint64_MultiThreaded) {
6635
constexpr int kNumThreads = 10;
6736
constexpr int kNumValues = 10;
6837
std::vector<std::vector<uint64_t>> results(kNumThreads);
@@ -73,7 +42,7 @@ TEST(RandomTest, DeterministicRandomUint64_MultiThreaded) {
7342
threads.emplace_back([&results, i] {
7443
results[i].reserve(kNumValues);
7544
for (int j = 0; j < kNumValues; ++j) {
76-
results[i].push_back(DeterministicRandomUint64());
45+
results[i].push_back(PseudoRandomUint64());
7746
}
7847
});
7948
}

koladata/internal/random.cc

Lines changed: 0 additions & 84 deletions
This file was deleted.

0 commit comments

Comments
 (0)