Skip to content

Commit e946f47

Browse files
authored
Fix test_update_rollback_failure test by disabling the pcluster-check-update.timer on the CN instead of cfn-hup (#7200)
* Fix test_update_rollback_failure test by disabling the pcluster-check-update.timer on the CN instead of cfn-hup
* Fix tox
* Exclude auto-generated directory from being checked by tox
1 parent c3398ca commit e946f47

File tree

4 files changed

+23
-27
lines changed

4 files changed

+23
-27
lines changed

cli/src/pcluster/cli/commands/ssh.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,13 @@ class SshCommand(CliCommand):
7979
"Run ssh command with the cluster username and IP address pre-populated. "
8080
"Arbitrary arguments are appended to the end of the ssh command."
8181
)
82-
epilog = textwrap.dedent(
83-
"""Example:
82+
epilog = textwrap.dedent("""Example:
8483
8584
pcluster ssh --cluster-name mycluster -i ~/.ssh/id_rsa
8685
8786
Returns an ssh command with the cluster username and IP address pre-populated:
8887
89-
ssh ec2-user@1.1.1.1 -i ~/.ssh/id_rsa"""
90-
)
88+
ssh ec2-user@1.1.1.1 -i ~/.ssh/id_rsa""")
9189

9290
def __init__(self, subparsers):
9391
super().__init__(

cli/tests/pcluster/models/test_s3_bucket.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -245,15 +245,13 @@ def test_get_resource_url(region, bucket_name, cluster_name, resource_name, expe
245245
"B": {"B1": "M"},
246246
},
247247
S3FileFormat.YAML,
248-
textwrap.dedent(
249-
"""\
248+
textwrap.dedent("""\
250249
A:
251250
A1: X
252251
A2: Y
253252
B:
254253
B1: M
255-
"""
256-
),
254+
"""),
257255
),
258256
(
259257
{

cli/tox.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ deps =
129129
commands =
130130
bandit -r \
131131
-c .bandit.ini \
132-
--exclude ../tests,tests,../cloudformation/tests \
132+
--exclude ../tests,tests,../cloudformation/tests,src/pcluster/api/models \
133133
{[vars]code_dirs} \
134134
{posargs}
135135

tests/integration-tests/tests/update/test_update_rollback_failure.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,10 @@ def test_update_rollback_failure(
104104
logger.info("Injecting cfn-signal failure on head node...")
105105
_inject_cfn_signal_failure(remote_command_executor)
106106

107-
# Step 3: Disable cfn-hup on CN1 BEFORE update
107+
# Step 3: Disable pcluster-check-update timer on CN1 BEFORE update
108108
# This ensures CN1 won't apply the update, causing cluster readiness check to fail
109-
logger.info(f"Disabling cfn-hup on CN1 ({cn1}) before update...")
110-
_disable_cfn_hup_on_compute_node(remote_command_executor, cn1)
109+
logger.info(f"Disabling the pcluster-check-update timer on CN1 ({cn1}) before update...")
110+
_disable_check_update_timer_on_compute_node(remote_command_executor, cn1)
111111

112112
# Step 4: Trigger cluster update with wait=False (non-blocking)
113113
logger.info("Triggering cluster update (non-blocking)...")
@@ -126,8 +126,11 @@ def test_update_rollback_failure(
126126
region, cluster.name, cn2_instance_id, initial_config_version, timeout_minutes=15
127127
)
128128

129-
logger.info(f"CN2 has applied the update. Disabling cfn-hup on CN2 ({cn2}) to inject rollback failure...")
130-
_disable_cfn_hup_on_compute_node(remote_command_executor, cn2)
129+
logger.info(
130+
f"CN2 has applied the update. Disabling pcluster-check-update timer on CN2 "
131+
f"({cn2}) to inject rollback failure..."
132+
)
133+
_disable_check_update_timer_on_compute_node(remote_command_executor, cn2)
131134

132135
# Wait for stack to reach UPDATE_ROLLBACK_COMPLETE state
133136
logger.info("Waiting for stack to reach UPDATE_ROLLBACK_COMPLETE...")
@@ -269,27 +272,24 @@ def _inject_cfn_signal_failure(remote_command_executor):
269272
logger.info("cfn-signal wrapper installed")
270273

271274

272-
def _disable_cfn_hup_on_compute_node(remote_command_executor, node_name):
275+
def _disable_check_update_timer_on_compute_node(remote_command_executor, node_name):
273276
"""
274-
Disable cfn-hup on a compute node using srun.
277+
Disable pcluster-check-update on a compute node using srun.
275278
276-
Uses supervisorctl to stop cfn-hup service on the compute node.
279+
Uses systemctl to stop the pcluster-check-update.timer on the compute node.
277280
"""
278-
logger.info(f"Disabling cfn-hup on compute node {node_name}...")
279-
280-
supervisorctl_path = _get_supervisorctl_path(remote_command_executor)
281+
logger.info(f"Disabling pcluster-check-update on compute node {node_name}...")
281282

282-
# Stop cfn-hup using srun
283-
remote_command_executor.run_remote_command(f"srun -w {node_name} sudo {supervisorctl_path} stop cfn-hup")
283+
# Stop pcluster-check-update.timer using srun
284+
remote_command_executor.run_remote_command(f"srun -w {node_name} sudo systemctl stop pcluster-check-update.timer")
284285

285-
# Verify cfn-hup is stopped
286-
# Note: supervisorctl status returns exit code 3 when process is STOPPED, so we use raise_on_error=False
286+
# Verify pcluster-check-update.timer is stopped
287287
result = remote_command_executor.run_remote_command(
288-
f"srun -w {node_name} sudo {supervisorctl_path} status cfn-hup",
288+
f"srun -w {node_name} systemctl is-active pcluster-check-update.timer",
289289
raise_on_error=False,
290290
)
291-
assert_that(result.stdout).contains("STOPPED")
292-
logger.info(f"cfn-hup stopped on {node_name} ✓")
291+
assert_that(result.stdout.strip()).contains("inactive")
292+
logger.info(f"pcluster-check-update.timer stopped on {node_name} ✓")
293293

294294

295295
@retry(wait_fixed=seconds(30), stop_max_delay=minutes(60))

0 commit comments

Comments
 (0)