Skip to content

Commit e2b1e1b

Browse files
authored
fix: log added to RDMA device selection assertion failure (#54)
1 parent add98bc commit e2b1e1b

File tree

1 file changed

+15
-7
lines changed

1 file changed

+15
-7
lines changed

checkpoint_engine/ps.py

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -305,13 +305,21 @@ def _get_my_rdma_device(local_rank: int, gpu_count: int, devices: list[str]) ->
305305
"""
306306
if not devices:
307307
raise RuntimeError("no rdma devices found")
308-
assert len(devices) <= gpu_count, (
309-
f"rdma devices count {len(devices)} should be less than or equal to gpu count {gpu_count}"
310-
)
311-
assert gpu_count % len(devices) == 0, (
312-
f"gpu count {gpu_count} should be divisible by rdma devices count {len(devices)}"
313-
)
314-
return devices[local_rank // (gpu_count // len(devices))]
308+
try:
309+
assert len(devices) <= gpu_count, (
310+
f"rdma devices count {len(devices)} should be less than or equal to gpu count {gpu_count}"
311+
)
312+
assert gpu_count % len(devices) == 0, (
313+
f"gpu count {gpu_count} should be divisible by rdma devices count {len(devices)}"
314+
)
315+
return devices[local_rank // (gpu_count // len(devices))]
316+
except AssertionError:
317+
logger.error(
318+
"Please set 'NCCL_IB_HCA' or 'PS_P2P_STORE_RDMA_DEVICES' environment variable to choose proper number of RDMA devices."
319+
"The number of RDMA devices should be less than or equal to GPU count, and GPU count should be divisible by the number of RDMA devices."
320+
"The acceptable value by NCCL_IB_HCA is documented in 'https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#id8'."
321+
)
322+
raise
315323

316324

317325
def _parse_NCCL_IB_HCA(value: str, available_devices: list[str]) -> list[str]:

0 commit comments

Comments
 (0)