Skip to content

Commit fec86ce

Browse files
authored
*: better locking (#4255)
Summary: 1. `dkg` and `edit` commands use `4.relay.obol.dev` as default relay. The list is to be updated later when a backup relay is added. 2. Previous lock implementation removed the lock file upon application exit, this is now changed to not remove the lock file, as we need to a) remember the timestamp, b) pass the lock to edit commands. The must be no impact to the existing `charon run` or `charon dkg` behavior. 3. `dkg` used locking unconditionally, there is no option to disable locking for `dkg`. 4. Unlike `dkg`, `charon run` requires `--private-key-file-lock` to be set, which now defaults to `true` in LCDVN. Still, `run` has ability to disable locking protection, whereas `dkg` does not. 5. The current lock update period is 1s, the lease period is 5s. This means if you restart charon (run or dkg) faster than 5s, the boot will be blocked and the process exits with the corresponding message. 6. The locking is expected to work across `run` and `dkg`. category: refactor ticket: #4242
1 parent f60764d commit fec86ce

15 files changed

+46
-57
lines changed

app/privkeylock/privkeylock.go

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ func New(privKeyFilePath, clusterLockFilePath, command string) (Service, error)
6464

6565
content, err := os.ReadFile(privKeyFileLockPath)
6666
if errors.Is(err, os.ErrNotExist) { //nolint:revive // Empty block is fine.
67-
// No file, we will create it in run
67+
// No file, we will create it in Run
6868
} else if err != nil {
6969
return Service{}, errors.Wrap(err, "read private key lock file", z.Str("path", privKeyFileLockPath))
7070
} else {
@@ -75,7 +75,7 @@ func New(privKeyFilePath, clusterLockFilePath, command string) (Service, error)
7575

7676
if time.Since(meta.Timestamp) <= staleDuration {
7777
return Service{}, errors.New(
78-
"existing private key lock file found, another charon instance may be running on your machine",
78+
"private key lock file is recently updated, another charon instance may be running on your machine",
7979
z.Str("path", privKeyFileLockPath),
8080
z.Str("command", meta.Command),
8181
z.Str("cluster_lock_hash", meta.ClusterLockHash),
@@ -100,7 +100,7 @@ func New(privKeyFilePath, clusterLockFilePath, command string) (Service, error)
100100
}
101101
}
102102

103-
if err := writePrivateKeyLockFile(privKeyFileLockPath, clusterLockHash, command, time.Now()); err != nil {
103+
if err := overwritePrivateKeyLockFile(privKeyFileLockPath, clusterLockHash, command, time.Now()); err != nil {
104104
return Service{}, err
105105
}
106106

@@ -124,7 +124,8 @@ type Service struct {
124124
done chan struct{} // Done is closed when Run exits, which exits the Close goroutine.
125125
}
126126

127-
// Run runs the service, updating the lock file every second and deleting it on context cancellation.
127+
// Run runs the service, periodically updating the lock file.
128+
// It does NOT delete the lock file on exit; callers are responsible for any cleanup.
128129
func (s Service) Run() error {
129130
defer close(s.done)
130131

@@ -134,14 +135,9 @@ func (s Service) Run() error {
134135
for {
135136
select {
136137
case <-s.quit:
137-
if err := os.Remove(s.lockFilePath); err != nil {
138-
return errors.Wrap(err, "delete private key lock file")
139-
}
140-
141138
return nil
142139
case <-tick.C:
143-
// Overwrite lockfile with new metadata
144-
if err := writePrivateKeyLockFile(s.lockFilePath, s.clusterLockHash, s.command, time.Now()); err != nil {
140+
if err := overwritePrivateKeyLockFile(s.lockFilePath, s.clusterLockHash, s.command, time.Now()); err != nil {
145141
return err
146142
}
147143
}
@@ -162,8 +158,8 @@ type metadata struct {
162158
ClusterLockHash string `json:"cluster_lock_hash,omitempty"`
163159
}
164160

165-
// writePrivateKeyLockFile creates or updates the lock file with the latest metadata.
166-
func writePrivateKeyLockFile(path, clusterLockHash, command string, now time.Time) error {
161+
// overwritePrivateKeyLockFile creates or updates the lock file with the latest metadata.
162+
func overwritePrivateKeyLockFile(path, clusterLockHash, command string, now time.Time) error {
167163
b, err := json.Marshal(metadata{Command: command, Timestamp: now, ClusterLockHash: clusterLockHash})
168164
if err != nil {
169165
return errors.Wrap(err, "marshal private key lock file")

app/privkeylock/privkeylock_internal_test.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ func TestService(t *testing.T) {
2525
writePrivateKeyFile(t, privKeyPath)
2626

2727
// Create a stale file that is ignored.
28-
err := writePrivateKeyLockFile(lockPath, "hash123", "test", time.Now().Add(-staleDuration))
28+
err := overwritePrivateKeyLockFile(lockPath, "hash123", "test", time.Now().Add(-staleDuration))
2929
require.NoError(t, err)
3030

3131
// Create a new service.
@@ -38,7 +38,7 @@ func TestService(t *testing.T) {
3838

3939
// Assert a new service can't be created.
4040
_, err = New(privKeyPath, lockFilePath, "test")
41-
require.ErrorContains(t, err, "existing private key lock file found")
41+
require.ErrorContains(t, err, "private key lock file is recently updated")
4242

4343
// Delete the file so Run will create it again.
4444
require.NoError(t, os.Remove(lockPath))
@@ -55,9 +55,9 @@ func TestService(t *testing.T) {
5555

5656
require.NoError(t, eg.Wait())
5757

58-
// Assert the file is deleted.
58+
// Assert the file is not deleted.
5959
_, openErr := os.Open(lockPath)
60-
require.ErrorIs(t, openErr, os.ErrNotExist)
60+
require.NoError(t, openErr)
6161
}
6262

6363
func TestClusterHashMismatchWithinGracePeriod(t *testing.T) {
@@ -71,7 +71,7 @@ func TestClusterHashMismatchWithinGracePeriod(t *testing.T) {
7171
writePrivateKeyFile(t, privKeyPath)
7272

7373
// Create a stale but within grace period lock file with hash1.
74-
err := writePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-staleDuration-time.Second))
74+
err := overwritePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-staleDuration-time.Second))
7575
require.NoError(t, err)
7676

7777
// Update cluster lock file to hash2.
@@ -95,7 +95,7 @@ func TestClusterHashMismatchAfterGracePeriod(t *testing.T) {
9595
writePrivateKeyFile(t, privKeyPath)
9696

9797
// Create an old lock file with hash1 (beyond grace period).
98-
err := writePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-gracePeriod-time.Second))
98+
err := overwritePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-gracePeriod-time.Second))
9999
require.NoError(t, err)
100100

101101
// Update cluster lock file to hash2.
@@ -127,7 +127,7 @@ func TestClusterHashMatchWithinGracePeriod(t *testing.T) {
127127
writePrivateKeyFile(t, privKeyPath)
128128

129129
// Create a recent lock file with hash1 (within stale duration).
130-
err := writePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-time.Second))
130+
err := overwritePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-time.Second))
131131
require.NoError(t, err)
132132

133133
// Try to create service with same hash - should fail due to staleness check.
@@ -136,7 +136,7 @@ func TestClusterHashMatchWithinGracePeriod(t *testing.T) {
136136
require.ErrorContains(t, err, "another charon instance may be running")
137137

138138
// Now create a stale lock file with same hash (beyond stale duration but within grace period).
139-
err = writePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-staleDuration-time.Second))
139+
err = overwritePrivateKeyLockFile(lockPath, "hash1", "test", time.Now().Add(-staleDuration-time.Second))
140140
require.NoError(t, err)
141141

142142
// Should succeed since hash matches and file is stale.
@@ -211,7 +211,7 @@ func TestEmptyHashToHashMigration(t *testing.T) {
211211
writePrivateKeyFile(t, privKeyPath)
212212

213213
// Create a stale lock file with empty cluster hash (migration scenario).
214-
err := writePrivateKeyLockFile(lockPath, "", "test", time.Now().Add(-staleDuration*2))
214+
err := overwritePrivateKeyLockFile(lockPath, "", "test", time.Now().Add(-staleDuration*2))
215215
require.NoError(t, err)
216216

217217
// Should succeed - empty hash shouldn't trigger grace period.

cmd/createcluster.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,7 +443,7 @@ func detectNodeDirs(clusterDir string, nodeAmount int) error {
443443
return errors.Wrap(err, "absolute path retrieval")
444444
}
445445

446-
if _, err := os.Stat(filepath.Join(absPath, "cluster-lock.json")); err == nil {
446+
if _, err := os.Stat(filepath.Join(absPath, clusterLockFile)); err == nil {
447447
return errors.New("existing node directory found, please delete it before running this command", z.Str("node_path", absPath))
448448
}
449449
}

cmd/dkg.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import (
1515
"github.com/obolnetwork/charon/dkg"
1616
)
1717

18+
var defaultDKGRelays = []string{"https://4.relay.obol.dev"}
19+
1820
func newDKGCmd(runFunc func(context.Context, dkg.Config) error) *cobra.Command {
1921
var config dkg.Config
2022

@@ -50,7 +52,7 @@ this command at the same time.`,
5052
bindKeymanagerFlags(cmd.Flags(), &config.KeymanagerAddr, &config.KeymanagerAuthToken)
5153
bindDefDirFlag(cmd.Flags(), &config.DefFile)
5254
bindNoVerifyFlag(cmd.Flags(), &config.NoVerify)
53-
bindP2PFlags(cmd, &config.P2P)
55+
bindP2PFlags(cmd, &config.P2P, defaultDKGRelays...)
5456
bindLogFlags(cmd.Flags(), &config.Log)
5557
bindPublishFlags(cmd.Flags(), &config)
5658
bindShutdownDelayFlag(cmd.Flags(), &config.ShutdownDelay)

cmd/edit_addoperators.go

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,6 @@ import (
1818
"github.com/obolnetwork/charon/eth2util/enr"
1919
)
2020

21-
const (
22-
defaultAlphaRelay = "https://4.relay.obol.dev"
23-
)
24-
2521
func newAddOperatorsCmd(runFunc func(context.Context, dkg.AddOperatorsConfig, dkg.Config) error) *cobra.Command {
2622
var (
2723
config dkg.AddOperatorsConfig
@@ -52,7 +48,7 @@ func newAddOperatorsCmd(runFunc func(context.Context, dkg.AddOperatorsConfig, dk
5248
cmd.Flags().DurationVar(&dkgConfig.Timeout, "timeout", time.Minute, "Timeout for the protocol, should be increased if protocol times out.")
5349

5450
bindNoVerifyFlag(cmd.Flags(), &dkgConfig.NoVerify)
55-
bindP2PFlags(cmd, &dkgConfig.P2P, defaultAlphaRelay)
51+
bindP2PFlags(cmd, &dkgConfig.P2P, defaultDKGRelays...)
5652
bindLogFlags(cmd.Flags(), &dkgConfig.Log)
5753
bindEth1Flag(cmd.Flags(), &dkgConfig.ExecutionEngineAddr)
5854
bindShutdownDelayFlag(cmd.Flags(), &dkgConfig.ShutdownDelay)

cmd/edit_addvalidators.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ func newAddValidatorsCmd(runFunc func(context.Context, addValidatorsConfig) erro
7373
// Bind `dkg` flags.
7474
bindKeymanagerFlags(cmd.Flags(), &config.DKG.KeymanagerAddr, &config.DKG.KeymanagerAuthToken)
7575
bindNoVerifyFlag(cmd.Flags(), &config.DKG.NoVerify)
76-
bindP2PFlags(cmd, &config.DKG.P2P, defaultAlphaRelay)
76+
bindP2PFlags(cmd, &config.DKG.P2P, defaultDKGRelays...)
7777
bindLogFlags(cmd.Flags(), &config.DKG.Log)
7878
bindShutdownDelayFlag(cmd.Flags(), &config.DKG.ShutdownDelay)
7979
bindEth1Flag(cmd.Flags(), &config.DKG.ExecutionEngineAddr)

cmd/edit_recreateprivatekeys.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func newRecreatePrivateKeysCmd(runFunc func(context.Context, dkg.ReshareConfig)
4343
cmd.Flags().DurationVar(&config.DKGConfig.Timeout, "timeout", time.Minute, "Timeout for the protocol, should be increased if protocol times out.")
4444

4545
bindNoVerifyFlag(cmd.Flags(), &config.DKGConfig.NoVerify)
46-
bindP2PFlags(cmd, &config.DKGConfig.P2P, defaultAlphaRelay)
46+
bindP2PFlags(cmd, &config.DKGConfig.P2P, defaultDKGRelays...)
4747
bindLogFlags(cmd.Flags(), &config.DKGConfig.Log)
4848
bindEth1Flag(cmd.Flags(), &config.DKGConfig.ExecutionEngineAddr)
4949
bindShutdownDelayFlag(cmd.Flags(), &config.DKGConfig.ShutdownDelay)

cmd/edit_removeoperators.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ func newRemoveOperatorsCmd(runFunc func(context.Context, dkg.RemoveOperatorsConf
5050
cmd.Flags().StringSliceVar(&config.ParticipatingENRs, "participating-operator-enrs", nil, "Comma-separated list of operator ENRs participating in the ceremony. Required if --operator-enrs-to-remove specifies more operators to remove than the fault tolerance of the current cluster.")
5151

5252
bindNoVerifyFlag(cmd.Flags(), &dkgConfig.NoVerify)
53-
bindP2PFlags(cmd, &dkgConfig.P2P, defaultAlphaRelay)
53+
bindP2PFlags(cmd, &dkgConfig.P2P, defaultDKGRelays...)
5454
bindLogFlags(cmd.Flags(), &dkgConfig.Log)
5555
bindEth1Flag(cmd.Flags(), &dkgConfig.ExecutionEngineAddr)
5656
bindShutdownDelayFlag(cmd.Flags(), &dkgConfig.ShutdownDelay)

cmd/edit_replaceoperator.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ func newReplaceOperatorCmd(runFunc func(context.Context, dkg.ReplaceOperatorConf
5151
cmd.Flags().DurationVar(&dkgConfig.Timeout, "timeout", time.Minute, "Timeout for the protocol, should be increased if protocol times out.")
5252

5353
bindNoVerifyFlag(cmd.Flags(), &dkgConfig.NoVerify)
54-
bindP2PFlags(cmd, &dkgConfig.P2P, defaultAlphaRelay)
54+
bindP2PFlags(cmd, &dkgConfig.P2P, defaultDKGRelays...)
5555
bindLogFlags(cmd.Flags(), &dkgConfig.Log)
5656
bindEth1Flag(cmd.Flags(), &dkgConfig.ExecutionEngineAddr)
5757
bindShutdownDelayFlag(cmd.Flags(), &dkgConfig.ShutdownDelay)

cmd/testall.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ func newTestAllCmd(runFunc func(context.Context, io.Writer, testAllConfig) error
4545
bindTestMEVFlags(cmd, &config.MEV, "mev-")
4646
bindTestInfraFlags(cmd, &config.Infra, "infra-")
4747

48-
bindP2PFlags(cmd, &config.Peers.P2P, defaultAlphaRelay)
48+
bindP2PFlags(cmd, &config.Peers.P2P, defaultDKGRelays...)
4949
bindTestLogFlags(cmd.Flags(), &config.Peers.Log)
5050

5151
wrapPreRunE(cmd, func(cmd *cobra.Command, _ []string) error {

0 commit comments

Comments
 (0)