@@ -1041,3 +1041,275 @@ def rccl_perf(
10411041 check_lat_dip (test_name , results_for_verification , test_exp_dict )
10421042
10431043 return all_raw_results
1044+
1045+
# Sub-directories of the Open MPI prefix that are exposed directly under install_path.
_MPI_LINKED_SUBDIRS = ("bin", "lib", "include")


def _mpi_artifact_present(shdl, artifact_path):
    """Return True if `test -x artifact_path` succeeds in any output returned by shdl.exec."""
    out = shdl.exec(
        f"test -x {artifact_path} && echo EXISTS",
        timeout=10,
        print_console=False,
    )
    return any("EXISTS" in v for v in out.values())


def _mpi_links_current(shdl, install_path, ompi_install_dir):
    """Return True if bin/lib/include under install_path are symlinks to ompi_install_dir."""
    # readlink prints nothing for a missing path or a non-symlink, so the comparison
    # fails both for absent links and for links that point at another Open MPI version.
    checks = " && ".join(
        f'test "$(readlink {install_path}/{sub})" = "{ompi_install_dir}/{sub}"'
        for sub in _MPI_LINKED_SUBDIRS
    )
    out = shdl.exec(f"{checks} && echo LINKS_EXIST", timeout=10, print_console=False)
    return any("LINKS_EXIST" in v for v in out.values())


def _link_mpi_dirs(shdl, install_path, ompi_install_dir):
    """(Re)create the bin/lib/include symlinks under install_path."""
    steps = [f"cd {install_path}"]
    # -n: replace an existing link itself. Without it, `ln -sf target bin` follows a
    # `bin` symlink that points to a directory and creates the new link inside the old
    # install, leaving `bin` pointing at the previous version.
    steps.extend(f"ln -sfn {ompi_install_dir}/{sub} {sub}" for sub in _MPI_LINKED_SUBDIRS)
    shdl.exec(" && ".join(steps), timeout=30, print_console=False)


def install_mpi(shdl, install_path, rocm_path, ucx_version="1.18.0", ompi_version="5.0.7"):
    """Install UCX and OpenMPI for RCCL tests with MPI support.

    Each component is skipped when its key executable (``ucx_info`` / ``mpirun``)
    is already present under ``install_path``. After installation,
    ``install_path/{bin,lib,include}`` are symlinked to the Open MPI prefix so
    that ``install_path`` itself can be used as the MPI home.

    Args:
        shdl: Single SSH handle for executing commands on head node (shared storage)
        install_path (str): Directory where UCX and OpenMPI should be installed
        rocm_path (str): Path to ROCm installation
        ucx_version (str, optional): UCX version to build. Defaults to "1.18.0"
        ompi_version (str, optional): OpenMPI version to build. Defaults to "5.0.7"

    Returns:
        dict: ``{"ucx_path": <UCX install prefix>, "ompi_path": install_path}``

    Raises:
        Exception: If download, build, link creation, or verification fails
    """
    ucx_dir = f"ucx-{ucx_version}"
    ompi_dir = f"ompi-{ompi_version}"
    # The download URL is keyed by "major.minor", e.g. "5.0" from "5.0.7". Taking the
    # first two components also keeps a two-component version such as "5.0" intact.
    ompi_base_ver = ".".join(ompi_version.split(".")[:2])

    ucx_install_dir = f"{install_path}/{ucx_dir}/install"
    ompi_install_dir = f"{install_path}/{ompi_dir}/install"

    # Require real artifacts, not only prefix directories
    # (partial installs / timeouts can leave empty dirs).
    ucx_binary = f"{ucx_install_dir}/bin/ucx_info"
    ompi_binary = f"{ompi_install_dir}/bin/mpirun"
    paths = {"ucx_path": ucx_install_dir, "ompi_path": install_path}

    ucx_installed = _mpi_artifact_present(shdl, ucx_binary)
    ompi_installed = _mpi_artifact_present(shdl, ompi_binary)

    if ucx_installed and ompi_installed:
        log.info("UCX and OpenMPI already installed in shared storage")
        if _mpi_links_current(shdl, install_path, ompi_install_dir):
            log.info("[MPI] Directory links already exist")
        else:
            log.info("[MPI] Directory links missing, creating them...")
            try:
                _link_mpi_dirs(shdl, install_path, ompi_install_dir)
            except Exception as e:
                raise Exception(f"Failed to create MPI directory links: {e}") from e
            log.info("[MPI] Directory soft links created successfully")
        return paths

    # Install UCX
    if not ucx_installed:
        log.info(f"[MPI] Installing UCX {ucx_version}...")
        ucx_tarball = f"ucx-{ucx_version}.tar.gz"
        ucx_cmd = " && ".join(
            [
                f"mkdir -p {install_path}",
                f"cd {install_path}",
                f"rm -rf {ucx_dir} {ucx_tarball}",
                f"wget https://github.com/openucx/ucx/releases/download/v{ucx_version}/{ucx_tarball}",
                f"mkdir -p {ucx_dir}",
                f"tar -zxf {ucx_tarball} -C {ucx_dir} --strip-components=1",
                f"cd {ucx_dir}",
                "mkdir -p build",
                "cd build",
                f"../configure --prefix={ucx_install_dir} --with-rocm={rocm_path}",
                "make -j$(nproc)",
                "make install",
            ]
        )
        try:
            shdl.exec(ucx_cmd, timeout=600, print_console=False)
        except Exception as e:
            raise Exception(f"Failed to install UCX: {e}") from e
        # Fail fast: Open MPI is configured against this prefix, so there is no point
        # in starting its long build when the UCX build did not produce ucx_info.
        if not _mpi_artifact_present(shdl, ucx_binary):
            raise Exception("UCX installation verification failed")

    # Install OpenMPI
    if not ompi_installed:
        log.info(f"[MPI] Installing OpenMPI {ompi_version}...")
        ompi_tarball = f"openmpi-{ompi_version}.tar.gz"
        ompi_cmd = " && ".join(
            [
                f"mkdir -p {install_path}",
                f"cd {install_path}",
                f"rm -rf {ompi_dir} {ompi_tarball}",
                f"wget https://download.open-mpi.org/release/open-mpi/v{ompi_base_ver}/{ompi_tarball}",
                f"mkdir -p {ompi_dir}",
                f"tar -zxf {ompi_tarball} -C {ompi_dir} --strip-components=1",
                f"cd {ompi_dir}",
                "mkdir -p build",
                "cd build",
                f"../configure --prefix={ompi_install_dir} --with-ucx={ucx_install_dir} "
                "--disable-oshmem --disable-mpi-fortran",
                "make -j$(nproc)",
                "make install",
            ]
        )
        # Open MPI: configure + make -j + make install often exceeds 15 minutes on shared hosts.
        try:
            shdl.exec(ompi_cmd, timeout=1200, print_console=False)
        except Exception as e:
            raise Exception(f"Failed to install OpenMPI: {e}") from e
        # Verify before linking so the top-level links never point at a broken install.
        if not _mpi_artifact_present(shdl, ompi_binary):
            raise Exception("OpenMPI installation verification failed")

    # Create directory-level soft links for MPI
    # Structure: <install_path>/bin -> <install_path>/ompi-<version>/install/bin
    log.info("[MPI] Creating directory soft links for MPI bin, lib, and include...")
    try:
        _link_mpi_dirs(shdl, install_path, ompi_install_dir)
    except Exception as e:
        raise Exception(f"Failed to create MPI soft links: {e}") from e

    log.info("[MPI] UCX and OpenMPI installation complete on all nodes")
    return paths
1185+
1186+
def _rccl_perf_binaries_present(shdl, install_path):
    """Return True if install_path holds at least one executable ``*_perf`` file."""
    # [ -f ] and [ -x ] follow symlinks, so a dangling link left behind by a removed
    # build tree does not count as an installed binary.
    probe = (
        f"for perf_binary in {install_path}/*_perf; do "
        f'if [ -f "$perf_binary" ] && [ -x "$perf_binary" ]; then echo EXISTS; break; fi; '
        f"done"
    )
    out = shdl.exec(probe, timeout=10, print_console=False)
    return any("EXISTS" in v for v in out.values())


def install_rccl_tests(
    shdl,
    install_path,
    repo_url="https://github.com/ROCm/rocm-systems.git",
    repo_branch="develop",
    with_mpi=False,
    mpi_home=None,
    rccl_home=None,
    rocm_home=None,
):
    """Install rccl-tests from ROCm/rocm-systems monorepo.

    The repository is sparse-cloned into ``install_path/rocm-systems``, built with
    its ``install.sh`` script, and every resulting ``*_perf`` binary is symlinked
    into ``install_path``. The build is skipped when ``install_path`` already
    contains an executable ``*_perf`` file.

    Args:
        shdl: Single SSH handle for executing commands on head node (shared storage)
        install_path (str): Directory where rccl-tests should be installed
        repo_url (str, optional): Git repository URL. Defaults to ROCm/rocm-systems
        repo_branch (str, optional): Git branch to clone. Defaults to 'develop'
        with_mpi (bool, optional): Build with MPI support. Defaults to False
        mpi_home (str, optional): Path to MPI installation. Required if with_mpi=True
        rccl_home (str, optional): Path to RCCL library install (e.g. /opt/rocm).
            Defaults to the resolved rocm_home.
        rocm_home (str, optional): Path to ROCm root for hipcc/amdclang (e.g. /opt/rocm).
            Defaults to /opt/rocm. Must not be the rccl-tests work dir.

    Returns:
        str: Path to the install directory containing all *_perf binaries

    Raises:
        ValueError: If with_mpi is True and mpi_home is not provided
        Exception: If clone, build, or verification fails
    """
    if with_mpi and not mpi_home:
        raise ValueError("mpi_home is required when with_mpi=True")

    # str() lets both locations be given as path-like objects as well as strings.
    rocm = str(rocm_home or "/opt/rocm").rstrip("/")
    rccl = rocm if rccl_home is None else str(rccl_home).rstrip("/")

    sparse_dir = "projects/rccl-tests"
    clone_root = f"{install_path}/rocm-systems"
    source_dir = f"{clone_root}/{sparse_dir}"

    # Check if installation already exists (check for any *_perf binary)
    if _rccl_perf_binaries_present(shdl, install_path):
        log.info(f"rccl-tests binaries already present in shared storage: {install_path}")
        return install_path

    # Clone rocm-systems repository
    clone_cmd = (
        f"mkdir -p {install_path} && "
        f"rm -rf {clone_root} && "
        f"git clone --depth 1 --filter=blob:none --sparse "
        f"-b {repo_branch} {repo_url} {clone_root} && "
        f"cd {clone_root} && git sparse-checkout set {sparse_dir}"
    )
    log.info(f"[rccl-tests] Cloning {sparse_dir} from rocm-systems to shared storage...")
    try:
        shdl.exec(clone_cmd, timeout=180, print_console=False)
    except Exception as e:
        raise Exception(f"Failed to clone rccl-tests: {e}") from e

    # Upstream rccl-tests Makefile sets rpath to $(NCCL_HOME) (install root), but
    # shared libraries live under $(NCCL_HOME)/lib. Patch rpath so binaries resolve
    # against the configured ROCm install instead of system /opt/rocm.
    patch_makefile_cmd = (
        f"cd {source_dir}/src && "
        "sed -i 's|-Wl,-rpath,$(NCCL_HOME)|-Wl,-rpath,$(NCCL_HOME)/lib -Wl,-rpath,$(ROCM_PATH)/lib|g' Makefile"
    )
    log.info("[rccl-tests] Patching Makefile rpath to prefer configured ROCm lib directories...")
    try:
        shdl.exec(patch_makefile_cmd, timeout=30, print_console=False)
    except Exception as e:
        raise Exception(f"Failed to patch rccl-tests Makefile rpath: {e}") from e

    # Build rccl-tests using install.sh script
    # The GPU target is resolved by the remote shell: first enumerated agent that is not gfx000.
    gpu_target = f"$({rocm}/bin/rocm_agent_enumerator | grep -v gfx000 | head -1)"
    hip_compiler = f"{rocm}/llvm/bin/amdclang++"

    # The MPI and non-MPI builds differ only in these two install.sh options.
    mpi_flags = f"--mpi --mpi_home {mpi_home} " if with_mpi else ""
    build_cmd = (
        f"cd {source_dir} && "
        f"ROCM_PATH={rocm} "
        f"./install.sh {mpi_flags}--rccl_home {rccl} --rocm_home {rocm} "
        f"--hip_compiler {hip_compiler} --gpu_targets {gpu_target}"
    )
    if with_mpi:
        log.info("[rccl-tests] Building rccl-tests with MPI support using install.sh in shared storage...")
    else:
        log.info("[rccl-tests] Building rccl-tests without MPI using install.sh in shared storage...")

    try:
        shdl.exec(build_cmd, timeout=300, print_console=False)
    except Exception as e:
        raise Exception(f"Failed to build rccl-tests: {e}") from e

    # Create soft links for all *_perf binaries
    link_cmd = (
        f"cd {install_path} && "
        f"for perf_binary in {source_dir}/build/*_perf; do "
        f'if [ -f "$perf_binary" ]; then '
        f'ln -sf "$perf_binary" "$(basename "$perf_binary")"; '
        f"fi; "
        f"done"
    )
    log.info("[rccl-tests] Creating soft links for *_perf binaries...")
    try:
        shdl.exec(link_cmd, timeout=30, print_console=False)
    except Exception as e:
        raise Exception(f"Failed to create soft links: {e}") from e

    # Verify installation (same criteria as the skip-if-installed check above)
    if not _rccl_perf_binaries_present(shdl, install_path):
        raise Exception("rccl-tests build completed but no *_perf binaries found in shared storage")

    log.info(f"[rccl-tests] Build complete in shared storage: {install_path}")
    return install_path
0 commit comments