Skip to content

Commit b68b225

Browse files
committed
fix(cvs exec): pass logger to Pssh instead of None
Pssh.__init__ calls log.debug immediately, so passing log=None raised "'NoneType' object has no attribute 'debug'" when running cvs exec.

Made-with: Cursor
Signed-off-by: Ignatious Johnson <ichristo@amd.com>
1 parent 0978591 commit b68b225

2 files changed

Lines changed: 19 additions & 13 deletions

File tree

cvs/lib/rccl_lib.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1260,6 +1260,7 @@ def install_rccl_tests(
12601260
with_mpi=False,
12611261
mpi_home=None,
12621262
rccl_home=None,
1263+
rocm_home=None,
12631264
):
12641265
"""Install rccl-tests from ROCm/rocm-systems monorepo.
12651266
@@ -1270,16 +1271,23 @@ def install_rccl_tests(
12701271
repo_branch (str, optional): Git branch to clone. Defaults to 'develop'
12711272
with_mpi (bool, optional): Build with MPI support. Defaults to False
12721273
mpi_home (str, optional): Path to MPI installation. Required if with_mpi=True
1273-
rccl_home (str, optional): Path to RCCL installation. Required if with_mpi=True
1274+
rccl_home (str, optional): Path to RCCL library install (e.g. /opt/rocm)
1275+
rocm_home (str, optional): Path to ROCm root for hipcc/amdclang (e.g. /opt/rocm).
1276+
Defaults to rccl_home, then /opt/rocm. Must not be the rccl-tests work dir.
12741277
12751278
Returns:
12761279
str: Path to the install directory containing all *_perf binaries
12771280
12781281
Raises:
12791282
Exception: If clone, build, or verification fails
12801283
"""
1281-
if with_mpi and (not mpi_home or not rccl_home):
1282-
raise ValueError("mpi_home and rccl_home are required when with_mpi=True")
1284+
rocm = (rocm_home or "/opt/rocm").rstrip("/")
1285+
if rccl_home is None:
1286+
rccl = rocm
1287+
else:
1288+
rccl = str(rccl_home).rstrip("/")
1289+
if with_mpi and not mpi_home:
1290+
raise ValueError("mpi_home is required when with_mpi=True")
12831291
sparse_dir = "projects/rccl-tests"
12841292
clone_root = f"{install_path}/rocm-systems"
12851293

@@ -1308,13 +1316,13 @@ def install_rccl_tests(
13081316
raise Exception(f"Failed to clone rccl-tests: {e}")
13091317

13101318
# Build rccl-tests using install.sh script
1311-
gpu_target = "$(/opt/rocm/bin/rocm_agent_enumerator | grep -v gfx000 | head -1)"
1319+
gpu_target = f"$({rocm}/bin/rocm_agent_enumerator | grep -v gfx000 | head -1)"
13121320

13131321
if with_mpi:
1314-
build_cmd = f"cd {clone_root}/{sparse_dir} && ./install.sh --mpi --mpi_home {mpi_home} --rccl_home {rccl_home} --rocm_home {rccl_home} --gpu_targets {gpu_target}"
1322+
build_cmd = f"cd {clone_root}/{sparse_dir} && ./install.sh --mpi --mpi_home {mpi_home} --rccl_home {rccl} --rocm_home {rocm} --gpu_targets {gpu_target}"
13151323
log.info("[rccl-tests] Building rccl-tests with MPI support using install.sh in shared storage...")
13161324
else:
1317-
build_cmd = f"cd {clone_root}/{sparse_dir} && ./install.sh --rccl_home {rccl_home} --rocm_home {rccl_home} --gpu_targets {gpu_target}"
1325+
build_cmd = f"cd {clone_root}/{sparse_dir} && ./install.sh --rccl_home {rccl} --rocm_home {rocm} --gpu_targets {gpu_target}"
13181326
log.info("[rccl-tests] Building rccl-tests without MPI using install.sh in shared storage...")
13191327

13201328
try:

cvs/tests/rccl/install_rccl_test.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,9 @@ def test_build_rccl_tests(shdl, cluster_dict, config_dict):
6868
num_nodes = len(all_nodes)
6969
with_mpi = num_nodes > 1 # Automatically enable MPI for multi-node setups
7070
mpi_home = rccl_cfg.get("mpi_path_var") # Use existing mpi_path_var
71-
rccl_home = rccl_cfg.get("rccl_dir") # Use existing rccl_dir
72-
rocm_path = rccl_cfg.get("rocm_path_var", "/opt/rocm")
71+
# install.sh --rccl_home / --rocm_home must point at the ROCm/RCCL *install* (e.g. /opt/rocm), not rccl_dir (work tree).
72+
rocm_path = (rccl_cfg.get("rocm_path_var") or "/opt/rocm").rstrip("/")
73+
rccl_path = (rccl_cfg.get("rccl_path_var") or rocm_path).rstrip("/")
7374

7475
log.info(f"[rccl-tests] Detected {num_nodes} node(s), MPI support: {'enabled' if with_mpi else 'disabled'}")
7576

@@ -98,18 +99,15 @@ def test_build_rccl_tests(shdl, cluster_dict, config_dict):
9899
elif with_mpi and mpi_exists:
99100
log.info(f"[rccl-tests] Using existing MPI installation at: {mpi_home}")
100101

101-
# Use ROCm path as RCCL home if not specified
102-
if with_mpi and not rccl_home:
103-
rccl_home = rocm_path
104-
105102
install_dir = rccl_lib.install_rccl_tests(
106103
shdl=shdl,
107104
install_path=install_path,
108105
repo_url=repo_url,
109106
repo_branch=repo_branch,
110107
with_mpi=with_mpi,
111108
mpi_home=mpi_home,
112-
rccl_home=rccl_home,
109+
rccl_home=rccl_path,
110+
rocm_home=rocm_path,
113111
)
114112

115113
mpi_status = "with MPI" if with_mpi else "without MPI"

0 commit comments

Comments
 (0)