diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh
index 386477936..d2f63bb86 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh
@@ -2,7 +2,7 @@
 
 # must be run as sudo
 
-set -eux
+set -eux
 
 # FSx Lustre Endpoints
 FSX_DNS_NAME="$1"
@@ -42,6 +42,149 @@ print_lustre_version()
     modinfo lustre | grep 'version:' | head -n 1 | awk '{print $2}'
 }
 
+# Verify if FSxL is created with EFA-enabled and if the FS is in the same AZ (cross AZ is not supported)
+# Arguments: $1 - FSx DNS name (the filesystem ID is its first dot-separated label)
+# Returns:   0 if EFA can be used; 1 if it cannot, or if any lookup failed
+verify_fsx_efa_compatibility()
+{
+    local fsx_dns_name="$1"
+
+    echo "[INFO] Verifying FSx EFA compatibility"
+
+    # Extract FSx filesystem ID from DNS name
+    local fsx_id="${fsx_dns_name%%.*}"
+
+    # Get instance AZ (IMDSv2 token first; fall back to IMDSv1 if no token is returned)
+    local imds_url="http://169.254.169.254/latest"
+    local imds_token instance_az
+    imds_token=$(curl -X PUT "$imds_url/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" -s --max-time 3 2>/dev/null) || imds_token=""
+    if [[ -n "$imds_token" ]]; then
+        instance_az=$(curl -H "X-aws-ec2-metadata-token: $imds_token" -s --max-time 3 "$imds_url/meta-data/placement/availability-zone" 2>/dev/null) || instance_az=""
+    else
+        instance_az=$(curl -s --max-time 3 "$imds_url/meta-data/placement/availability-zone" 2>/dev/null) || instance_az=""
+    fi
+
+    if [[ -z "$instance_az" ]]; then
+        echo "[WARN] Could not determine instance AZ - proceeding without EFA verification"
+        return 1
+    fi
+
+    # Get FSx filesystem details (EFA and Subnet details)
+    local fsx_info
+    if ! fsx_info=$(aws fsx describe-file-systems --file-system-ids "$fsx_id" --query 'FileSystems[0].{LustreConfiguration: LustreConfiguration, SubnetIds: SubnetIds}' --output json 2>/dev/null); then
+        echo "[WARN] Could not describe FSx filesystem - proceeding without EFA verification"
+        return 1
+    fi
+
+    # Get FSx AZ from subnet (To match FSx and instance AZ)
+    local fsx_subnet
+    fsx_subnet=$(echo "$fsx_info" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data['SubnetIds'][0])" 2>/dev/null) || fsx_subnet=""
+
+    if [[ -z "$fsx_subnet" ]]; then
+        echo "[WARN] Could not determine FSx subnet - proceeding without EFA verification"
+        return 1
+    fi
+
+    local fsx_az
+    fsx_az=$(aws ec2 describe-subnets --subnet-ids "$fsx_subnet" --query 'Subnets[0].AvailabilityZone' --output text 2>/dev/null) || fsx_az=""
+
+    # A failed lookup is not a cross-AZ filesystem; the CLI prints "None" for a null text result
+    if [[ -z "$fsx_az" || "$fsx_az" == "None" ]]; then
+        echo "[WARN] Could not determine FSx AZ - proceeding without EFA verification"
+        return 1
+    fi
+
+    if [[ "$instance_az" != "$fsx_az" ]]; then
+        echo "[INFO] FSx filesystem is in different AZ ($fsx_az vs $instance_az) - EFA not supported cross-AZ"
+        return 1
+    fi
+
+    # Check if FSx has EFA enabled (checking for EfaEnabled field and value).
+    # Currently, as observed, if FSx is created without EFA, the field doesn't exist in the describe call.
+    local efa_enabled
+    efa_enabled=$(echo "$fsx_info" | python3 -c "import sys, json; data=json.load(sys.stdin); lustre_config=data.get('LustreConfiguration') or {}; print('FieldNotPresent' if 'EfaEnabled' not in lustre_config else lustre_config['EfaEnabled'])" 2>/dev/null) || efa_enabled=""
+
+    if [[ "$efa_enabled" != "True" ]]; then
+        if [[ "$efa_enabled" == "FieldNotPresent" ]]; then
+            echo "[INFO] FSx filesystem was not created with EFA enabled - skipping EFA configuration"
+        elif [[ -z "$efa_enabled" ]]; then
+            echo "[WARN] Could not read EfaEnabled from FSx description - skipping EFA configuration"
+        else
+            echo "[INFO] FSx filesystem has EFA disabled (EfaEnabled: $efa_enabled) - skipping EFA configuration"
+        fi
+        return 1
+    fi
+
+    echo "[INFO] FSx filesystem is EFA-compatible (same AZ: $instance_az, EfaEnabled: true)"
+    return 0
+}
+
+# Configure EFA for Lustre if supported
+# Globals: FSX_DNS_NAME (read)
+# Returns: 0 when EFA was configured or is not applicable (skipped); 1 when the EFA client setup fails
+configure_efa_lustre()
+{
+    echo "[INFO] Configuring EFA for FSx Lustre"
+
+    # Check if instance has EFA drivers installed and configured
+    if [[ ! -x "/opt/amazon/efa/bin/fi_info" ]]; then
+        echo "[INFO] EFA tools not found - skipping EFA configuration"
+        return 0
+    fi
+    if ! /opt/amazon/efa/bin/fi_info -p efa >/dev/null 2>&1; then
+        echo "[INFO] EFA provider not available - skipping EFA configuration"
+        return 0
+    fi
+    echo "[INFO] EFA provider detected successfully"
+
+    # Verify FSx EFA compatibility
+    if ! verify_fsx_efa_compatibility "$FSX_DNS_NAME"; then
+        echo "[INFO] FSx not EFA-compatible - skipping EFA configuration"
+        return 0
+    fi
+
+    echo "[INFO] EFA requirements met - proceeding with EFA configuration"
+    echo "[INFO] - EFA provider: available"
+    echo "[INFO] - FSx EFA enabled: yes"
+    echo "[INFO] - Same AZ: yes"
+
+    # Use a private work directory: this script runs as root, so fixed file names
+    # under /tmp could be pre-created or swapped by another local user before they are executed.
+    local work_dir
+    if ! work_dir=$(mktemp -d); then
+        echo "[ERROR] Failed to create temporary directory for EFA configuration"
+        return 1
+    fi
+    local archive="$work_dir/configure-efa-fsx-lustre-client.zip"
+    local setup_script="$work_dir/configure-efa-fsx-lustre-client/setup.sh"
+    local rc=0
+
+    # Download, extract and run the EFA configuration script; stop at the first failing step
+    if ! ansible localhost -m ansible.builtin.get_url -a "url=https://docs.aws.amazon.com/fsx/latest/LustreGuide/samples/configure-efa-fsx-lustre-client.zip dest=$archive mode='0644'"; then
+        echo "[ERROR] Failed to download EFA configuration script"
+        rc=1
+    elif ! ansible localhost -m ansible.builtin.unarchive -a "src=$archive dest=$work_dir remote_src=yes"; then
+        echo "[ERROR] Failed to extract EFA configuration script"
+        rc=1
+    elif ! ansible localhost -b -m ansible.builtin.file -a "path=$setup_script mode='0755'"; then
+        echo "[ERROR] Failed to make EFA configuration script executable"
+        rc=1
+    elif ! ansible localhost -b -m ansible.builtin.command -a "$setup_script"; then
+        echo "[ERROR] EFA configuration script failed"
+        rc=1
+    fi
+
+    # Cleanup: always remove the archive and everything extracted from it, on success and on failure
+    # NOTE(review): assumes setup.sh installs whatever must persist outside its extraction directory - confirm
+    rm -rf -- "$work_dir"
+
+    if (( rc != 0 )); then
+        return "$rc"
+    fi
+
+    echo "[INFO] EFA configuration for FSx Lustre completed"
+}
+
 # Load lnet modules
 load_lnet_modules()
 {
@@ -74,7 +217,7 @@ mount_fs() {
         fi
 
         echo "[STEP] Verifying mountpoint..."
-        if ! ansible localhost -b -m ansible.builtin.command -a "mountpoint $MOUNT_POINT"; then
+        if ! ansible localhost -b -m ansible.builtin.command -a "mountpoint $MOUNT_POINT"; then
             echo "[WARN] Mountpoint verification failed — retrying in $delay seconds"
             sleep "$delay"; ((attempt++)); continue
         fi
@@ -112,16 +255,17 @@ restart_daemon()
     systemctl status fsx.automount
 }
 
-main()
+main()
 {
     verify_parameters
     echo "Mount_fsx called with fsx_dns_name: $FSX_DNS_NAME, fsx_mountname: $FSX_MOUNTNAME"
     echo "Using mount_point: $MOUNT_POINT"
     echo "LUSTRE CLIENT CONFIGURATION $(print_lustre_version)"
+    configure_efa_lustre
     load_lnet_modules
     mount_fs || exit 1
     restart_daemon
     echo "FSx Lustre mounted successfully to $MOUNT_POINT"
 }
 
-main "$@"
\ No newline at end of file
+main "$@"