Skip to content

Commit e14c728

Browse files
committed
Add MPI/rccl-tests installation support
- Add install_mpi() function to rccl_lib.py for UCX and OpenMPI installation
- Add install_rccl_tests() function with MPI support and soft link creation
- Add install_rccl_test.py for automated RCCL tests installation

Made-with: Cursor
Signed-off-by: Ignatious Johnson <ichristo@amd.com>
1 parent 9ff9896 commit e14c728

3 files changed

Lines changed: 425 additions & 0 deletions

File tree

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"_comment": "Minimal RCCL JSON for cvs/tests/rccl/install_rccl_test.py only. Adjust paths for your cluster shared storage and ROCm layout.",
3+
"rccl": {
4+
"_comment_rccl_tests_dir": "Where sparse checkout is built and *_perf symlinks are created (must be visible on all nodes if shared filesystem).",
5+
"rccl_tests_dir": "/opt/rccl-tests-install",
6+
7+
"_comment_rccl_tests_repo": "Mono-repo containing projects/rccl-tests (sparse clone).",
8+
"rccl_tests_repo": "https://github.com/ROCm/rocm-systems.git",
9+
10+
"_comment_rccl_tests_branch": "Branch passed to git clone -b.",
11+
"rccl_tests_branch": "develop",
12+
13+
"_comment_rocm_path_var": "ROCm root for hipcc/rocm_agent_enumerator and UCX --with-rocm. install.sh --rocm_home.",
14+
"rocm_path_var": "/opt/rocm",
15+
16+
"_comment_rccl_path_var": "RCCL install prefix for install.sh --rccl_home. Often same as rocm_path_var when RCCL is under ROCm.",
17+
"rccl_path_var": "/opt/rocm",
18+
19+
"_comment_mpi_path_var": "Open MPI prefix (directory containing bin/mpirun). Required for multi-node runs (num_nodes>1). If mpirun is missing here, UCX/OpenMPI are built into this path. Single-node: may be omitted.",
20+
"mpi_path_var": "/usr",
21+
22+
"_comment_ucx_version": "Used only when MPI is auto-installed (multi-node and mpirun missing at mpi_path_var).",
23+
"ucx_version": "1.18.0",
24+
25+
"_comment_ompi_version": "Used only when MPI is auto-installed.",
26+
"ompi_version": "5.0.7"
27+
}
28+
}

cvs/lib/rccl_lib.py

Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,3 +1041,275 @@ def rccl_perf(
10411041
check_lat_dip(test_name, results_for_verification, test_exp_dict)
10421042

10431043
return all_raw_results
1044+
1045+
1046+
def install_mpi(shdl, install_path, rocm_path, ucx_version="1.18.0", ompi_version="5.0.7"):
    """Install UCX and OpenMPI for RCCL tests with MPI support.

    Each component is downloaded as a release tarball, built under
    ``{install_path}/<name>-<version>`` and installed into that directory's
    ``install`` prefix. A component is skipped when its marker executable
    (``bin/ucx_info`` for UCX, ``bin/mpirun`` for OpenMPI) already exists.
    Afterwards ``{install_path}/bin``, ``lib`` and ``include`` are symlinked to
    the OpenMPI prefix, so ``install_path`` itself can be used as the MPI home.

    Args:
        shdl: Single SSH handle for executing commands on head node (shared storage).
            Its ``exec(cmd, timeout=..., print_console=...)`` result is consumed
            as a mapping whose values are output strings.
        install_path (str): Directory where UCX and OpenMPI should be installed
        rocm_path (str): Path to ROCm installation (passed to UCX ``--with-rocm``)
        ucx_version (str, optional): UCX version to build. Defaults to "1.18.0"
        ompi_version (str, optional): OpenMPI version to build. Defaults to "5.0.7"

    Returns:
        dict: ``{"ucx_path": <UCX install prefix>, "ompi_path": install_path}``

    Raises:
        Exception: If download, build, link creation, or verification fails.
            The underlying error, when there is one, is chained as ``__cause__``.
    """
    ucx_dir = f"ucx-{ucx_version}"
    ompi_dir = f"ompi-{ompi_version}"
    # The download URL is grouped by release series ("v5.0" for "5.0.7").
    # Take the first two components rather than "all but the last" so a
    # two-component version such as "5.0" still maps to "v5.0", not "v5".
    ompi_base_ver = ".".join(ompi_version.split(".")[:2])

    ucx_install_dir = f"{install_path}/{ucx_dir}/install"
    ompi_install_dir = f"{install_path}/{ompi_dir}/install"
    paths = {"ucx_path": ucx_install_dir, "ompi_path": install_path}

    def _is_executable(path):
        """Return True when the handle reports ``path`` as an executable file."""
        out = shdl.exec(f"test -x {path} && echo EXISTS", timeout=10, print_console=False)
        return any("EXISTS" in v for v in out.values())

    # Require real artifacts, not only prefix directories (partial installs /
    # timeouts can leave empty dirs).
    ucx_marker = f"{ucx_install_dir}/bin/ucx_info"
    ompi_marker = f"{ompi_install_dir}/bin/mpirun"

    # Directory-level soft links: <install_path>/bin -> <ompi prefix>/bin, etc.
    # "-n" is essential: without it, an existing "bin" symlink pointing at a
    # directory is dereferenced and the new link is created *inside* the old
    # target, leaving <install_path>/bin pointing at the previous install.
    # NOTE(review): if <install_path>/bin is a real directory (e.g. install_path
    # is "/usr"), ln still creates the link inside that directory - confirm the
    # intended layout for such prefixes.
    link_cmd = f"""
        cd {install_path} &&
        ln -sfn {ompi_install_dir}/bin bin &&
        ln -sfn {ompi_install_dir}/lib lib &&
        ln -sfn {ompi_install_dir}/include include
    """

    ucx_installed = _is_executable(ucx_marker)
    ompi_installed = _is_executable(ompi_marker)

    if ucx_installed and ompi_installed:
        log.info("UCX and OpenMPI already installed in shared storage")
        # Check if directory links exist, create them if missing
        check_links = shdl.exec(
            f"test -L {install_path}/bin && test -L {install_path}/lib && test -L {install_path}/include && echo LINKS_EXIST",
            timeout=10,
            print_console=False,
        )
        if any("LINKS_EXIST" in v for v in check_links.values()):
            log.info("[MPI] Directory links already exist")
        else:
            log.info("[MPI] Directory links missing, creating them...")
            try:
                shdl.exec(link_cmd, timeout=30, print_console=False)
            except Exception as e:
                raise Exception(f"Failed to create MPI directory links: {e}") from e
            log.info("[MPI] Directory soft links created successfully")
        return paths

    # Install UCX
    if not ucx_installed:
        log.info(f"[MPI] Installing UCX {ucx_version}...")
        ucx_cmd = f"""
            mkdir -p {install_path} &&
            cd {install_path} &&
            rm -rf {ucx_dir} ucx-{ucx_version}.tar.gz &&
            wget https://github.com/openucx/ucx/releases/download/v{ucx_version}/ucx-{ucx_version}.tar.gz &&
            mkdir -p {ucx_dir} &&
            tar -zxf ucx-{ucx_version}.tar.gz -C {ucx_dir} --strip-components=1 &&
            cd {ucx_dir} &&
            mkdir -p build &&
            cd build &&
            ../configure --prefix={ucx_install_dir} --with-rocm={rocm_path} &&
            make -j$(nproc) &&
            make install
        """
        try:
            shdl.exec(ucx_cmd, timeout=600, print_console=False)
        except Exception as e:
            raise Exception(f"Failed to install UCX: {e}") from e

    # Install OpenMPI (configured against the UCX prefix above)
    if not ompi_installed:
        log.info(f"[MPI] Installing OpenMPI {ompi_version}...")
        ompi_cmd = f"""
            mkdir -p {install_path} &&
            cd {install_path} &&
            rm -rf {ompi_dir} openmpi-{ompi_version}.tar.gz &&
            wget https://download.open-mpi.org/release/open-mpi/v{ompi_base_ver}/openmpi-{ompi_version}.tar.gz &&
            mkdir -p {ompi_dir} &&
            tar -zxf openmpi-{ompi_version}.tar.gz -C {ompi_dir} --strip-components=1 &&
            cd {ompi_dir} &&
            mkdir -p build &&
            cd build &&
            ../configure --prefix={ompi_install_dir} --with-ucx={ucx_install_dir} --disable-oshmem --disable-mpi-fortran &&
            make -j$(nproc) &&
            make install
        """
        # Open MPI: configure + make -j + make install often exceeds 15 minutes on shared hosts.
        try:
            shdl.exec(ompi_cmd, timeout=1200, print_console=False)
        except Exception as e:
            raise Exception(f"Failed to install OpenMPI: {e}") from e

    log.info("[MPI] Creating directory soft links for MPI bin, lib, and include...")
    try:
        shdl.exec(link_cmd, timeout=30, print_console=False)
    except Exception as e:
        raise Exception(f"Failed to create MPI soft links: {e}") from e

    # Verify installations (same criteria as the skip-if-installed checks above).
    if not _is_executable(ucx_marker):
        raise Exception("UCX installation verification failed")
    if not _is_executable(ompi_marker):
        raise Exception("OpenMPI installation verification failed")

    log.info("[MPI] UCX and OpenMPI installation complete on all nodes")
    return paths
1185+
1186+
1187+
def install_rccl_tests(
    shdl,
    install_path,
    repo_url="https://github.com/ROCm/rocm-systems.git",
    repo_branch="develop",
    with_mpi=False,
    mpi_home=None,
    rccl_home=None,
    rocm_home=None,
):
    """Install rccl-tests from ROCm/rocm-systems monorepo.

    Sparse-clones ``projects/rccl-tests`` into ``{install_path}/rocm-systems``,
    patches the Makefile rpath, builds with the project's ``install.sh`` and
    symlinks every built ``*_perf`` binary into ``install_path``. The whole
    sequence is skipped when an executable ``*_perf`` entry is already present
    in ``install_path``.

    Args:
        shdl: Single SSH handle for executing commands on head node (shared storage).
            Its ``exec(cmd, timeout=..., print_console=...)`` result is consumed
            as a mapping whose values are output strings.
        install_path (str): Directory where rccl-tests should be installed
        repo_url (str, optional): Git repository URL. Defaults to ROCm/rocm-systems
        repo_branch (str, optional): Git branch to clone. Defaults to 'develop'
        with_mpi (bool, optional): Build with MPI support. Defaults to False
        mpi_home (str, optional): Path to MPI installation. Required if with_mpi=True
        rccl_home (str, optional): Path to RCCL library install (e.g. /opt/rocm).
            Defaults to the resolved ``rocm_home``.
        rocm_home (str, optional): Path to ROCm root for hipcc/amdclang (e.g. /opt/rocm).
            Defaults to /opt/rocm. Must not be the rccl-tests work dir.

    Returns:
        str: Path to the install directory containing all *_perf binaries

    Raises:
        ValueError: If ``with_mpi`` is true and ``mpi_home`` is not given.
        Exception: If clone, build, or verification fails. The underlying
            error, when there is one, is chained as ``__cause__``.
    """
    rocm = (rocm_home or "/opt/rocm").rstrip("/")
    rccl = rocm if rccl_home is None else str(rccl_home).rstrip("/")
    if with_mpi and not mpi_home:
        raise ValueError("mpi_home is required when with_mpi=True")

    sparse_dir = "projects/rccl-tests"
    clone_root = f"{install_path}/rocm-systems"
    src_root = f"{clone_root}/{sparse_dir}"

    def _has_perf_binaries():
        """Return True when install_path holds at least one usable *_perf binary."""
        # "[ -x ]" follows symlinks, so dangling links left behind by a removed
        # build tree do not count as an installation (plain "ls" would list them).
        out = shdl.exec(
            f'for f in {install_path}/*_perf; do [ -x "$f" ] && echo EXISTS && break; done',
            timeout=10,
            print_console=False,
        )
        return any("EXISTS" in v for v in out.values())

    # Skip everything if a previous run already produced binaries.
    if _has_perf_binaries():
        log.info(f"rccl-tests binaries already present in shared storage: {install_path}")
        return install_path

    # Clone rocm-systems repository (always from scratch: any old clone is removed).
    clone_cmd = (
        f"mkdir -p {install_path} && "
        f"rm -rf {clone_root} && "
        f"git clone --depth 1 --filter=blob:none --sparse "
        f"-b {repo_branch} {repo_url} {clone_root} && "
        f"cd {clone_root} && git sparse-checkout set {sparse_dir}"
    )
    log.info(f"[rccl-tests] Cloning {sparse_dir} from rocm-systems to shared storage...")
    try:
        shdl.exec(clone_cmd, timeout=180, print_console=False)
    except Exception as e:
        raise Exception(f"Failed to clone rccl-tests: {e}") from e

    # Upstream rccl-tests Makefile sets rpath to $(NCCL_HOME) (install root), but
    # shared libraries live under $(NCCL_HOME)/lib. Patch rpath so binaries resolve
    # against the configured ROCm install instead of system /opt/rocm.
    # The sed script is single-quoted so the shell leaves $(...) for make.
    patch_makefile_cmd = (
        f"cd {src_root}/src && "
        "sed -i 's|-Wl,-rpath,$(NCCL_HOME)|-Wl,-rpath,$(NCCL_HOME)/lib -Wl,-rpath,$(ROCM_PATH)/lib|g' Makefile"
    )
    log.info("[rccl-tests] Patching Makefile rpath to prefer configured ROCm lib directories...")
    try:
        shdl.exec(patch_makefile_cmd, timeout=30, print_console=False)
    except Exception as e:
        raise Exception(f"Failed to patch rccl-tests Makefile rpath: {e}") from e

    # Build rccl-tests using install.sh script.
    # gpu_target is a shell command substitution evaluated on the remote side:
    # the first enumerated agent that is not the gfx000 placeholder.
    gpu_target = f"$({rocm}/bin/rocm_agent_enumerator | grep -v gfx000 | head -1)"
    hip_compiler = f"{rocm}/llvm/bin/amdclang++"

    # The MPI and non-MPI builds differ only by these leading install.sh flags.
    mpi_flags = f"--mpi --mpi_home {mpi_home} " if with_mpi else ""
    build_cmd = (
        f"cd {src_root} && "
        f"ROCM_PATH={rocm} "
        f"./install.sh {mpi_flags}--rccl_home {rccl} --rocm_home {rocm} "
        f"--hip_compiler {hip_compiler} --gpu_targets {gpu_target}"
    )
    if with_mpi:
        log.info("[rccl-tests] Building rccl-tests with MPI support using install.sh in shared storage...")
    else:
        log.info("[rccl-tests] Building rccl-tests without MPI using install.sh in shared storage...")

    try:
        shdl.exec(build_cmd, timeout=300, print_console=False)
    except Exception as e:
        raise Exception(f"Failed to build rccl-tests: {e}") from e

    # Create soft links for all *_perf binaries
    link_cmd = (
        f"cd {install_path} && "
        f"for perf_binary in {src_root}/build/*_perf; do "
        "  if [ -f \"$perf_binary\" ]; then "
        "    ln -sf \"$perf_binary\" \"$(basename \"$perf_binary\")\"; "
        "  fi; "
        "done"
    )
    log.info("[rccl-tests] Creating soft links for *_perf binaries...")
    try:
        shdl.exec(link_cmd, timeout=30, print_console=False)
    except Exception as e:
        raise Exception(f"Failed to create soft links: {e}") from e

    # Verify installation (same probe as the skip-if-installed check above).
    if not _has_perf_binaries():
        raise Exception("rccl-tests build completed but no *_perf binaries found in shared storage")

    log.info(f"[rccl-tests] Build complete in shared storage: {install_path}")
    return install_path

0 commit comments

Comments
 (0)