Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# must be run as sudo

set -eux
set -eux

# FSx Lustre Endpoints
FSX_DNS_NAME="$1"
Expand Down Expand Up @@ -42,6 +42,117 @@ print_lustre_version()
modinfo lustre | grep 'version:' | head -n 1 | awk '{print $2}'
}

# Verify that the FSx for Lustre filesystem was created with EFA enabled and
# that it lives in the same AZ as this instance (cross-AZ EFA is not supported).
#
# Arguments:
#   $1 - FSx DNS name; its first dot-separated label is used as the filesystem ID
# Outputs:
#   [INFO]/[WARN] progress lines on stdout
# Returns:
#   0 if the filesystem is in the instance's AZ and reports EfaEnabled=true;
#   1 if either check fails or any lookup cannot be completed
verify_fsx_efa_compatibility()
{
  local fsx_dns_name="$1"
  local imds_url="http://169.254.169.254/latest"
  # Declared separately from assignment so command failures are not masked by 'local'.
  local fsx_id imds_token instance_az fsx_info fsx_subnet fsx_az efa_enabled

  echo "[INFO] Verifying FSx EFA compatibility"

  # Extract FSx filesystem ID from DNS name (everything before the first dot)
  fsx_id="${fsx_dns_name%%.*}"

  # Get instance AZ. Try IMDSv2 first; with no token, fall back to an unauthenticated request.
  # 'curl -f' makes HTTP errors fail with empty output, so an error body is never
  # mistaken for a token or an AZ name.
  imds_token=$(curl -f -s --max-time 3 -X PUT "$imds_url/api/token" \
    -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null) || imds_token=""
  if [[ -n "$imds_token" ]]; then
    instance_az=$(curl -f -s --max-time 3 -H "X-aws-ec2-metadata-token: $imds_token" \
      "$imds_url/meta-data/placement/availability-zone" 2>/dev/null) || instance_az=""
  else
    instance_az=$(curl -f -s --max-time 3 \
      "$imds_url/meta-data/placement/availability-zone" 2>/dev/null) || instance_az=""
  fi

  if [[ -z "$instance_az" ]]; then
    echo "[WARN] Could not determine instance AZ - proceeding without EFA verification"
    return 1
  fi

  # Get FSx filesystem details (EFA and Subnet details)
  if ! fsx_info=$(aws fsx describe-file-systems --file-system-ids "$fsx_id" \
    --query 'FileSystems[0].{LustreConfiguration: LustreConfiguration, SubnetIds: SubnetIds}' \
    --output json 2>/dev/null); then
    echo "[WARN] Could not describe FSx filesystem - proceeding without EFA verification"
    return 1
  fi

  # Get FSx AZ from subnet (to match FSx and instance AZ)
  fsx_subnet=$(echo "$fsx_info" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data['SubnetIds'][0])" 2>/dev/null) || fsx_subnet=""

  if [[ -z "$fsx_subnet" ]]; then
    echo "[WARN] Could not determine FSx subnet - proceeding without EFA verification"
    return 1
  fi

  fsx_az=$(aws ec2 describe-subnets --subnet-ids "$fsx_subnet" \
    --query 'Subnets[0].AvailabilityZone' --output text 2>/dev/null) || fsx_az=""

  # A failed lookup must not be reported as an AZ mismatch. The CLI prints
  # "None" in text mode when the query matches nothing.
  if [[ -z "$fsx_az" || "$fsx_az" == "None" ]]; then
    echo "[WARN] Could not determine FSx AZ - proceeding without EFA verification"
    return 1
  fi

  if [[ "$instance_az" != "$fsx_az" ]]; then
    echo "[INFO] FSx filesystem is in different AZ ($fsx_az vs $instance_az) - EFA not supported cross-AZ"
    return 1
  fi

  # Check if FSx has EFA enabled. As observed, when FSx is created without EFA
  # the EfaEnabled field is absent from the describe output, so absence is
  # reported separately from an explicit false. Python prints JSON true as "True".
  efa_enabled=$(echo "$fsx_info" | python3 -c "import sys, json; data=json.load(sys.stdin); lustre_config=data.get('LustreConfiguration', {}); print('FieldNotPresent' if 'EfaEnabled' not in lustre_config else lustre_config['EfaEnabled'])" 2>/dev/null) || efa_enabled=""

  if [[ "$efa_enabled" != "True" ]]; then
    if [[ "$efa_enabled" == "FieldNotPresent" ]]; then
      echo "[INFO] FSx filesystem was not created with EFA enabled - skipping EFA configuration"
    else
      echo "[INFO] FSx filesystem has EFA disabled (EfaEnabled: $efa_enabled) - skipping EFA configuration"
    fi
    return 1
  fi

  echo "[INFO] FSx filesystem is EFA-compatible (same AZ: $instance_az, EfaEnabled: true)"
  return 0
}

# Configure EFA for Lustre if supported.
#
# Skips (returning 0) when the instance has no working EFA provider or when
# verify_fsx_efa_compatibility reports the filesystem as not EFA-compatible.
# Otherwise downloads the AWS-provided client configuration bundle into a
# private temporary directory, runs its setup.sh, and removes the directory.
#
# Globals:
#   FSX_DNS_NAME      (read) DNS name passed to verify_fsx_efa_compatibility
#   EFA_FI_INFO_PATH  (read, optional) overrides the fi_info binary location;
#                     defaults to /opt/amazon/efa/bin/fi_info
#   TMPDIR            (read, optional) parent of the temporary directory
# Outputs:
#   [INFO]/[WARN]/[ERROR] progress lines on stdout
# Returns:
#   0 when EFA was configured or configuration was skipped;
#   1 when downloading, extracting, or running the configuration script failed
configure_efa_lustre()
{
  local fi_info_bin="${EFA_FI_INFO_PATH:-/opt/amazon/efa/bin/fi_info}"
  local bundle_name="configure-efa-fsx-lustre-client"
  local bundle_url="https://docs.aws.amazon.com/fsx/latest/LustreGuide/samples/${bundle_name}.zip"
  local work_dir zip_path setup_script
  local rc=0

  echo "[INFO] Configuring EFA for FSx Lustre"

  # Check if instance has EFA drivers installed and configured
  if [[ ! -x "$fi_info_bin" ]]; then
    echo "[INFO] EFA tools not found - skipping EFA configuration"
    return 0
  fi
  if ! "$fi_info_bin" -p efa >/dev/null 2>&1; then
    echo "[INFO] EFA provider not available - skipping EFA configuration"
    return 0
  fi
  echo "[INFO] EFA provider detected successfully"

  # Verify FSx EFA compatibility
  if ! verify_fsx_efa_compatibility "$FSX_DNS_NAME"; then
    echo "[INFO] FSx not EFA-compatible - skipping EFA configuration"
    return 0
  fi

  echo "[INFO] EFA requirements met - proceeding with EFA configuration"
  echo "[INFO] - EFA provider: available"
  echo "[INFO] - FSx EFA enabled: yes"
  echo "[INFO] - Same AZ: yes"

  # The downloaded script is executed with elevated privileges, so stage it in
  # an unpredictable, owner-only directory rather than at a fixed /tmp path.
  if ! work_dir=$(mktemp -d "${TMPDIR:-/tmp}/efa-fsx-lustre.XXXXXX"); then
    echo "[ERROR] Failed to create temporary directory for EFA configuration script"
    return 1
  fi
  zip_path="$work_dir/$bundle_name.zip"
  setup_script="$work_dir/$bundle_name/setup.sh"

  # Download, extract, make executable, run; stop at the first failing step.
  if ! ansible localhost -m ansible.builtin.get_url -a "url=$bundle_url dest=$zip_path mode='0644'"; then
    echo "[ERROR] Failed to download EFA configuration script"
    rc=1
  elif ! ansible localhost -m ansible.builtin.unarchive -a "src=$zip_path dest=$work_dir remote_src=yes"; then
    echo "[ERROR] Failed to extract EFA configuration script"
    rc=1
  elif ! ansible localhost -b -m ansible.builtin.file -a "path=$setup_script mode='0755'"; then
    echo "[ERROR] Failed to make EFA configuration script executable"
    rc=1
  elif ! ansible localhost -b -m ansible.builtin.command -a "$setup_script"; then
    echo "[ERROR] EFA configuration script failed"
    rc=1
  fi

  # Cleanup runs on success and on failure, and removes the whole staging
  # directory (archive and everything extracted from it).
  rm -rf -- "${work_dir:?}" \
    || echo "[WARN] Could not remove temporary directory $work_dir"

  if (( rc != 0 )); then
    return "$rc"
  fi

  echo "[INFO] EFA configuration for FSx Lustre completed"
}

# Load lnet modules
load_lnet_modules()
{
Expand Down Expand Up @@ -74,7 +185,7 @@ mount_fs() {
fi

echo "[STEP] Verifying mountpoint..."
if ! ansible localhost -b -m ansible.builtin.command -a "mountpoint $MOUNT_POINT"; then
if ! ansible localhost -b -m ansible.builtin.command -a "mountpoint $MOUNT_POINT"; then
echo "[WARN] Mountpoint verification failed — retrying in $delay seconds"
sleep "$delay"; ((attempt++)); continue
fi
Expand Down Expand Up @@ -112,16 +223,17 @@ restart_daemon()
systemctl status fsx.automount
}

main()
main()
{
verify_parameters
echo "Mount_fsx called with fsx_dns_name: $FSX_DNS_NAME, fsx_mountname: $FSX_MOUNTNAME"
echo "Using mount_point: $MOUNT_POINT"
echo "LUSTRE CLIENT CONFIGURATION $(print_lustre_version)"
configure_efa_lustre
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as above: what if the FSxL file system is not EFA-enabled?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the customer is using a non-EFA-enabled FSxL file system, communication falls back to TCP automatically. Installing the EFA client on the instance has no drawback.

load_lnet_modules
mount_fs || exit 1
restart_daemon
echo "FSx Lustre mounted successfully to $MOUNT_POINT"
}

main "$@"
main "$@"