diff --git a/1.architectures/7.sagemaker-hyperpod-eks/create_config.sh b/1.architectures/7.sagemaker-hyperpod-eks/create_config.sh
old mode 100755
new mode 100644
diff --git a/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slinky_installation.sh b/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slinky_installation.sh
new file mode 100755
index 000000000..652754aec
--- /dev/null
+++ b/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/slinky_installation.sh
@@ -0,0 +1,1126 @@
+#!/bin/bash
+
+# Workshop Automation Script
+# This script automates the steps of the workshop by executing CLI commands
+
+# Exit immediately if a command exits with a non-zero status
+set -e
+
+#===Global===
+export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
+export AWS_REGION=${AWS_DEFAULT_REGION:-$(aws configure get region)}
+TOTAL_STEPS=5
+CURRENT_STEP=0
+
+#===Style Definitions===
+GREEN='\033[0;32m'
+BLUE='\033[0;34m'
+YELLOW='\033[1;33m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+# Function to print a yellow header
+print_header() {
+    echo -e "\n${BLUE}=================================================${NC}"
+    echo -e "\n${YELLOW}==== $1 ====${NC}\n"
+    echo -e "\n${BLUE}=================================================${NC}"
+
+}
+
+# UX Function for a Progress Bar :)
+progress_bar() {
+    local duration=$1
+    local steps=$2
+    local width=50
+    local progress=0
+
+    for ((i=0; i<steps; i++)); do
+        progress=$(( ((i + 1) * width) / steps ))
+        printf "\r[%-${width}s] %d%%" "$(printf '#%.0s' $(seq 1 $progress))" "$(( ((i + 1) * 100) / steps ))"
+        sleep "$duration"
+    done
+    echo
+}
+
+# Function to install or update the AWS CLI v2
+install_aws_cli() {
+    DEVICE=$(uname -m)
+    curl "https://awscli.amazonaws.com/awscli-exe-linux-${DEVICE}.zip" -o "awscliv2.zip"
+    unzip -q -o awscliv2.zip
+    sudo ./aws/install --update
+    rm -rf aws awscliv2.zip
+}
+
+# Function to check the AWS CLI installation and version
+check_and_install_aws_cli() {
+    echo -e "\n${BLUE}=== Checking AWS CLI Installation ===${NC}"
+
+    if ! command -v aws &> /dev/null; then
+        echo -e "${YELLOW}⚠️  AWS CLI is not installed. Installing...${NC}"
+        install_aws_cli
+    else
+        echo -e "${GREEN}✅ AWS CLI found. Checking version...${NC}"
+        CLI_VERSION=$(aws --version | awk '{print $1}' | cut -d/ -f2)
+
+        echo -e "${BLUE}Current version: ${YELLOW}$CLI_VERSION${NC}"
+        echo -e "${BLUE}Min. required version: ${YELLOW}2.17.1${NC}"
+
+        if [[ "$(printf '%s\n' "2.17.1" "$CLI_VERSION" | sort -V | head -n1)" != "2.17.1" ]]; then
+            echo -e "${YELLOW}⚠️  AWS CLI version $CLI_VERSION is lower than required.${NC}"
+            echo -e "${YELLOW}   Updating AWS CLI...${NC}"
+            install_aws_cli
+        else
+            echo -e "${GREEN}✅ AWS CLI version $CLI_VERSION is up to date.${NC}"
+        fi
+    fi
+
+    echo -e "${BLUE}=== AWS CLI Check Complete ===${NC}\n"
+
+}
+
+# Function to check if Git is installed and configured
+check_git() {
+    if ! command -v git &> /dev/null; then
+        echo "Git is not installed. Please install Git and try again."
+        exit 1
+    fi
+}
+
+# Function to display the prerequisites before starting this workshop
+display_important_prereqs() {
+    echo -e "${BLUE}Before running this script, please ensure the following prerequisites are met:${NC}\n"
+
+    echo -e "${GREEN}1. 🔑 IAM Credentials:${NC}"
+    echo "   You have Administrator Access Credentials in IAM."
+    echo "   This is crucial as we'll be using CloudFormation to create IAM roles and policies."
+    echo "   Run 'aws configure' to set up your credentials."
+
+    echo -e "\n${GREEN}2. Deploy the SageMaker HyperPod on EKS stack using this link: https://catalog.workshops.aws/sagemaker-hyperpod-eks/en-US/00-setup/00-workshop-infra-cfn ${NC}"
+    echo "   Set GeneralPurposeInstanceCount to at least 2."
+    echo "   Make sure the CloudFormation stack creation succeeded, the EKS and HyperPod clusters are in \"InService\" status, and the nodes are \"Running\"."
+    echo "   (It may take up to an hour for the DeepHealthChecks on the nodes to finish and for the nodes to reach the \"Running\" state.)"
+
+    echo -e "\n${GREEN}3. 
Build a Slurmd Deep Learning Container:${NC}" + echo " Build a Slurm DLC using this dockerfile: https://github.com/aws-samples/awsome-distributed-training/blob/feature/slinkly-slurm-hyperpod-eks/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/dlc-slurmd.Dockerfile " + echo " following this direction: https://github.com/aws-samples/awsome-distributed-training/blob/feature/slinkly-slurm-hyperpod-eks/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/Docker-Build-README.md" + + echo -e "\n${GREEN}4. 🔧 Packages required for this script to run:${NC}" + echo " Ensure you install the following: eksctl, kubectl, helm, jq, and yq (install:sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && sudo chmod +x /usr/bin/yq ) " + echo -e "\n${YELLOW}Ready to proceed? Press Enter to continue or Ctrl+C to exit...${NC}" + read +} + + +# Helper function to get user inputs with default values specified +get_input() { + local prompt="$1" + local default="$2" + local input + read -e -p "$prompt [$default]: " input + echo "${input:-$default}" +} + +get_prompt() { + local prompt="$1" + local input + read -e -p "$prompt: " input + echo "$input" +} +region_check() { + + NEW_REGION=$(get_input "Please, enter the AWS region where you want to set up your cluster" "$AWS_REGION") #eks cluster name + + if [[ -z "$NEW_REGION" ]]; then + echo -e "${GREEN}✅ Using default region: ${YELLOW}$AWS_REGION${NC}" + else + export AWS_REGION="$NEW_REGION" + echo -e "${GREEN}✅ Region updated to: ${YELLOW}$AWS_REGION${NC}" + fi + + echo -e "\n${BLUE}Your region is set to: ${YELLOW}$AWS_REGION${NC}" + echo -e "${BLUE}Ensure your chosen region supports SageMaker HyperPod.${NC}" + echo -e "${GREEN}You can check out https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-hyperpod.html#sagemaker-hyperpod-available-regions to learn about supported regions.${NC}" + echo -e "${BLUE}Press Enter to continue...${NC}" + read +} + + + +# Warning message function +warning() { + echo -e "${BLUE}⚠️ Please note:${NC}" + echo -e " - Cluster creation may take some time (~15-20 min)" + echo -e " - This operation may incur costs on your AWS account" + echo -e " - Ensure you understand the implications before proceeding\n" +} + +# Function to display goodbye message +goodbye() { + # Final goodbye message + echo -e "${GREEN}Thank you for using the SageMaker HyperPod Cluster Creation Script!${NC}" + echo -e "${GREEN}For any issues or questions, please refer to the AWS documentation.${NC}" + echo "https://docs.aws.amazon.com/sagemaker/latest/dg/smcluster-getting-started.html" + + # Exit message + echo -e "\n${BLUE}Exiting script. Good luck with your SageMaker HyperPod journey! 
👋${NC}\n" +} + + +# Function to setup environment variables +setup_env_vars() { + + # Clear env_vars from previous runs + > env_vars + + echo -e "${YELLOW}Generating new environment variables...${NC}" + + # -------------------------- + # Write instance mappings + # -------------------------- + echo "export EKS_CLUSTER_NAME=${EKS_CLUSTER_NAME}" >> env_vars + echo "[INFO] EKS_CLUSTER_NAME = ${EKS_CLUSTER_NAME}" + echo "export ACCEL_INSTANCE_TYPE=${ACCEL_INSTANCE_TYPE}" >> env_vars + echo "export ACCEL_INSTANCE_COUNT=${ACCEL_INSTANCE_COUNT}" >> env_vars + # Export General Purpose Instance details without INFO messages + echo "export GEN_INSTANCE_TYPE=${GEN_INSTANCE_TYPE}" >> env_vars + echo "export GEN_INSTANCE_COUNT=${GEN_INSTANCE_COUNT}" >> env_vars + + EKS_CLUSTER_INFO=$(aws eks describe-cluster --name "$EKS_CLUSTER_NAME" --region "$AWS_REGION") #Eks cluster information + + # -------------------------- + # Get EKS_CLUSTER_ARN from CloudFormation + # -------------------------- + EKS_CLUSTER_ARN=$(aws cloudformation describe-stacks \ + --stack-name "$STACK_ID" \ + --region "$AWS_REGION" \ + --query 'Stacks[0].Outputs[?OutputKey==`EKSClusterArn`].OutputValue' \ + --output text) + + if [[ -n "$EKS_CLUSTER_ARN" && "$EKS_CLUSTER_ARN" != "None" ]]; then + echo "export EKS_CLUSTER_ARN=${EKS_CLUSTER_ARN}" >> env_vars + echo "[INFO] EKS_CLUSTER_ARN = ${EKS_CLUSTER_ARN}" + else + echo "[ERROR] Failed to retrieve EKS_CLUSTER_ARN from CloudFormation." + return 1 + fi + + # -------------------------- + # Get S3_BUCKET_NAME + # -------------------------- + S3_BUCKET_NAME=$(aws cloudformation describe-stacks \ + --stack-name "$STACK_ID" \ + --region "$AWS_REGION" \ + --query 'Stacks[0].Outputs[?OutputKey==`S3BucketName`].OutputValue' \ + --output text) + + if [[ -n "$S3_BUCKET_NAME" && "$S3_BUCKET_NAME" != "None" ]]; then + echo "export S3_BUCKET_NAME=${S3_BUCKET_NAME}" >> env_vars + echo "[INFO] S3_BUCKET_NAME = ${S3_BUCKET_NAME}" + else + echo "[ERROR] Failed to retrieve S3_BUCKET_NAME from CloudFormation." + return 1 + fi + + #SageMakerIAMRoleArn + EXECUTION_ROLE=$(aws cloudformation describe-stacks \ + --stack-name "$STACK_ID" \ + --region "$AWS_REGION" \ + --query 'Stacks[0].Outputs[?OutputKey==`SageMakerIAMRoleArn`].OutputValue' \ + --output text) + + + if [[ -n "$EXECUTION_ROLE" && "$EXECUTION_ROLE" != "None" ]]; then + echo "export EXECUTION_ROLE=${EXECUTION_ROLE}" >> env_vars + echo "[INFO] EXECUTION_ROLE = ${EXECUTION_ROLE}" + else + echo "[ERROR] Failed to retrieve EXECUTION_ROLE from CloudFormation." + return 1 + fi + + # -------------------------- + # Get VPC_ID + # -------------------------- + VPC_ID=$(aws cloudformation describe-stacks \ + --stack-name "$STACK_ID" \ + --region "$AWS_REGION" \ + --query 'Stacks[0].Outputs[?OutputKey==`VpcId`].OutputValue' \ + --output text) + + if [[ -n "$VPC_ID" && "$VPC_ID" != "None" ]]; then + echo "export VPC_ID=${VPC_ID}" >> env_vars + echo "[INFO] VPC_ID = ${VPC_ID}" + else + echo "[ERROR] Failed to retrieve VPC_ID from CloudFormation." + return 1 + fi + + #EKS Cluster subnet + + # -------------------------- + # Get PRIVATE_SUBNET_ID directly from EKS cluster + # -------------------------- + echo "[INFO] Retrieving subnet information from EKS cluster ${EKS_CLUSTER_NAME}..." 
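+
+    # Note: the step below takes the first subnet ID returned by `aws eks describe-cluster`
+    # and assumes it is a private subnet. If your VPC mixes public and private subnets,
+    # a stricter lookup is possible; a rough, untested sketch (ALL_SUBNET_IDS is a scratch
+    # variable introduced here, EKS_CLUSTER_INFO is the variable populated above):
+    #
+    #   ALL_SUBNET_IDS=$(echo "$EKS_CLUSTER_INFO" | jq -r '.cluster.resourcesVpcConfig.subnetIds | join(" ")')
+    #   PRIVATE_SUBNET_ID=$(aws ec2 describe-subnets --subnet-ids $ALL_SUBNET_IDS \
+    #       --query 'Subnets[?MapPublicIpOnLaunch==`false`] | [0].SubnetId' --output text)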
+
+    # Get cluster VPC configuration
+    # Extract the first private subnet ID
+    export PRIVATE_SUBNET_ID=$(echo "$EKS_CLUSTER_INFO" | jq -r '.cluster.resourcesVpcConfig.subnetIds[0]')
+
+    if [[ -n "$PRIVATE_SUBNET_ID" && "$PRIVATE_SUBNET_ID" != "null" ]]; then
+        echo "export PRIVATE_SUBNET_ID=${PRIVATE_SUBNET_ID}" >> env_vars
+        echo "[INFO] PRIVATE_SUBNET_ID = ${PRIVATE_SUBNET_ID}"
+    else
+        echo "[ERROR] Failed to retrieve PRIVATE_SUBNET_ID from EKS cluster."
+        return 1
+    fi
+
+    # --------------------------
+    # Get SECURITY_GROUP_ID
+    # --------------------------
+    SECURITY_GROUP_ID=$(aws cloudformation describe-stacks \
+        --stack-name "$STACK_ID" \
+        --region "$AWS_REGION" \
+        --query 'Stacks[0].Outputs[?OutputKey==`SecurityGroupId`].OutputValue' \
+        --output text)
+
+    if [[ -n "$SECURITY_GROUP_ID" && "$SECURITY_GROUP_ID" != "None" ]]; then
+        echo "export SECURITY_GROUP_ID=${SECURITY_GROUP_ID}" >> env_vars
+        echo "[INFO] SECURITY_GROUP_ID = ${SECURITY_GROUP_ID}"
+    else
+        echo "[ERROR] Failed to retrieve SECURITY_GROUP_ID from CloudFormation."
+        return 1
+    fi
+
+    # --------------------------
+    # Source the generated variables
+    # --------------------------
+    source env_vars
+
+    # Update kubectl config to point to the correct cluster
+    echo -e "${YELLOW}Updating kubectl configuration to use cluster: ${EKS_CLUSTER_NAME}${NC}"
+    aws eks update-kubeconfig --name $EKS_CLUSTER_NAME --region $AWS_REGION
+
+    # --------------------------
+    # Summary
+    # --------------------------
+    echo -e "\n${BLUE}=== Environment Variables Summary ===${NC}"
+    echo -e "${GREEN}Current environment variables:${NC}"
+    cat env_vars
+    echo -e "\n${BLUE}=== Environment Setup Complete ===${NC}"
+}
+
+
+# Function to gather the CloudFormation stack parameters used to configure the cluster
+create_config() {
+
+    STACK_ID=$(get_input "Enter the name of the CloudFormation stack created in step 2 of the prerequisites" "hyperpod-eks-full-stack")
+    #get the eks cluster name
+    EKS_CLUSTER_NAME=$(aws cloudformation describe-stacks \
+        --stack-name "$STACK_ID" \
+        --region "$AWS_REGION" \
+        --query 'Stacks[0].Parameters[?ParameterKey==`EKSClusterName`].ParameterValue' \
+        --output text)
+
+
+    ACCEL_INSTANCE_TYPE=$(aws cloudformation describe-stacks \
+        --stack-name "$STACK_ID" \
+        --region "$AWS_REGION" \
+        --query 'Stacks[0].Parameters[?ParameterKey==`AcceleratedInstanceType`].ParameterValue' \
+        --output text)
+
+    ACCEL_INSTANCE_GROUP=$(aws cloudformation describe-stacks \
+        --stack-name "$STACK_ID" \
+        --region "$AWS_REGION" \
+        --query 'Stacks[0].Parameters[?ParameterKey==`AcceleratedInstanceGroupName`].ParameterValue' \
+        --output text)
+
+    ACCEL_INSTANCE_COUNT=$(aws cloudformation describe-stacks \
+        --stack-name "$STACK_ID" \
+        --region "$AWS_REGION" \
+        --query 'Stacks[0].Parameters[?ParameterKey==`AcceleratedInstanceCount`].ParameterValue' \
+        --output text)
+
+    GEN_INSTANCE_TYPE=$(aws cloudformation describe-stacks \
+        --stack-name "$STACK_ID" \
+        --region "$AWS_REGION" \
+        --query 'Stacks[0].Parameters[?ParameterKey==`GeneralPurposeInstanceType`].ParameterValue' \
+        --output text)
+
+    GEN_INSTANCE_COUNT=$(aws cloudformation describe-stacks \
+        --stack-name "$STACK_ID" \
+        --region "$AWS_REGION" \
+        --query 'Stacks[0].Parameters[?ParameterKey==`GeneralPurposeInstanceCount`].ParameterValue' \
+        --output text)
+
+    HP_CLUSTER_NAME=$(aws cloudformation describe-stacks \
+        --stack-name "$STACK_ID" \
+        --region "$AWS_REGION" \
+        --query 'Stacks[0].Parameters[?ParameterKey==`HyperPodClusterName`].ParameterValue' \
+        --output text)
+
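+    # Note: each parameter above is retrieved with its own describe-stacks call.
+    # The same data could come from a single cached call; a rough, untested sketch
+    # (STACK_JSON is a scratch variable introduced here):
+    #
+    #   STACK_JSON=$(aws cloudformation describe-stacks --stack-name "$STACK_ID" --region "$AWS_REGION")
+    #   HP_CLUSTER_NAME=$(echo "$STACK_JSON" | \
+    #       jq -r '.Stacks[0].Parameters[] | select(.ParameterKey=="HyperPodClusterName").ParameterValue')
+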
setup_env_vars #sets and sources the env variables +} + +# Function to create FSx for Lustre Storage Class +create_fsx_lustre_storage_class() +{ + echo + echo -e "${BLUE}=== Creating FSx for Lustre Storage Class ===${NC}" + + FSX_SERVICE_ACCOUNT_NAME="fsx-csi-controller-sa-${EKS_CLUSTER_NAME}" + FSX_ROLE_NAME="FSXLCSI-${EKS_CLUSTER_NAME}-${AWS_REGION}" + + # Create an IAM OpenID Connect (OIDC) identity provider for the cluster + echo -e "${YELLOW}Creating IAM OIDC identity provider...${NC}" + eksctl utils associate-iam-oidc-provider --cluster $EKS_CLUSTER_NAME --approve + # Create a service account with an IAM role for the FSx for Lustre CSI driver + + # Wait a moment for cleanup + + echo -e "${YELLOW}Creating service account with IAM role for use with FSx for Lustre CSI driver...(${FSX_SERVICE_ACCOUNT_NAME})${NC}" + + eksctl create iamserviceaccount \ + --name ${FSX_SERVICE_ACCOUNT_NAME} \ + --namespace kube-system \ + --cluster $EKS_CLUSTER_NAME \ + --attach-policy-arn arn:aws:iam::aws:policy/AmazonFSxFullAccess \ + --approve \ + --role-name ${FSX_ROLE_NAME} \ + --region $AWS_REGION + + # Verify service account annotation + echo -e "${YELLOW}Verifying service account annotation...${NC}" + kubectl get sa ${FSX_SERVICE_ACCOUNT_NAME} -n kube-system -oyaml #retrives information about theFSXLCSI-${EKS_CLUSTER_NAME}-${AWS_REGION}" service account + + echo -e "${YELLOW} Adding the FSx for Lustre CSI Driver to helm repos...${NC}" + # Check if repo already exists before adding it + if ! helm repo list | grep -q "aws-fsx-csi-driver"; then + helm repo add aws-fsx-csi-driver https://kubernetes-sigs.github.io/aws-fsx-csi-driver + else + echo -e "${YELLOW}Helm repository aws-fsx-csi-driver already exists, skipping add...${NC}" + fi + + # Check if FSx CSI driver pods exist + if kubectl get pods -n kube-system -l app.kubernetes.io/name=aws-fsx-csi-driver 2>/dev/null | grep -q .; then + echo -e "${YELLOW}Existing FSx CSI driver pods found. Cleaning up...${NC}" + + # Show existing pods before cleanup + echo -e "${YELLOW}Current FSx CSI driver pods:${NC}" + kubectl get pods -n kube-system -l app.kubernetes.io/name=aws-fsx-csi-driver + + # Delete the helm release + echo -e "${YELLOW}Uninstalling existing FSx CSI driver...${NC}" + helm uninstall aws-fsx-csi-driver -n kube-system + + # Delete any remaining pods (backup cleanup) + kubectl delete pods -n kube-system -l app.kubernetes.io/name=aws-fsx-csi-driver 2>/dev/null || true + + # Wait a moment for cleanup + echo -e "${YELLOW}Waiting for pods to be cleaned up...${NC}" + sleep 10 + else + echo -e "${YELLOW}No existing FSx CSI driver pods found. 
Proceeding with installation...${NC}" + fi + + echo -e "${YELLOW}Installing the FSx for Lustre CSI driver...${NC}" + helm repo update + helm upgrade --install aws-fsx-csi-driver \ + --namespace kube-system \ + --set controller.serviceAccount.create=false \ + --set controller.serviceAccount.name=${FSX_SERVICE_ACCOUNT_NAME} \ + aws-fsx-csi-driver/aws-fsx-csi-driver + + # Verify installation of the FSx for Lustre CSI driver + echo -e "${YELLOW}Verifying FSx for Lustre CSI driver installation...${NC}" + kubectl get pods -n kube-system -l app.kubernetes.io/name=aws-fsx-csi-driver + + # Install the FSx for Lustre Storage Class using Helm + echo -e "${YELLOW}Installing FSx for Lustre Storage Class...${NC}" + cat > /tmp/lustre-storageclass.yaml << EOL +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: fsx-sc +provisioner: fsx.csi.aws.com +parameters: + subnetId: \${PRIVATE_SUBNET_ID} + securityGroupIds: \${SECURITY_GROUP_ID} + deploymentType: PERSISTENT_2 + automaticBackupRetentionDays: "0" + copyTagsToBackups: "true" + perUnitStorageThroughput: "250" + dataCompressionType: "LZ4" + fileSystemTypeVersion: "2.15" +mountOptions: + - flock +EOL + + # Create an FSx for Lustre storage class + echo -e "Creating FSx for Lustre storage class..." + envsubst < /tmp/lustre-storageclass.yaml | kubectl apply -f - + + # Verify the storage class was created + echo -e "${YELLOW}Verifying storage class creation...${NC}" + kubectl get sc fsx-sc -oyaml + + # Clean up the temporary YAML file + rm -f /tmp/lustre-storageclass.yaml + + echo -e "${GREEN}✅ FSx for Lustre Storage Class setup completed${NC}" + echo +} + + +install_aws_load_balancer_controller() +{ + echo -e "${BLUE}=== Installing AWS Load Balancer Controller ===${NC}" + + # Create the IAM policy + echo -e "${YELLOW}Creating IAM policy for AWS Load Balancer Controller...${NC}" + curl -O https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/v2.13.0/docs/install/iam_policy.json + + # Check if policy already exists + if ! aws iam create-policy \ + --policy-name AWSLoadBalancerControllerIAMPolicy-v2.13.0 \ + --policy-document file://iam_policy.json 2>/dev/null; then + echo -e "${YELLOW}Policy AWSLoadBalancerControllerIAMPolicy-v2.13.0 already exists, continuing...${NC}" + fi + + # Create a service account with IAM role + echo -e "${YELLOW}Creating service account with IAM role...${NC}" + eksctl create iamserviceaccount \ + --cluster=$EKS_CLUSTER_NAME \ + --namespace=kube-system \ + --name=aws-load-balancer-controller \ + --attach-policy-arn=arn:aws:iam::$AWS_ACCOUNT_ID:policy/AWSLoadBalancerControllerIAMPolicy-v2.13.0 \ + --override-existing-serviceaccounts \ + --region $AWS_REGION \ + --approve + + # Verify service account annotation + echo -e "${YELLOW}Verifying Load balance contoller service account annotation (aws-load-balancer-controller) ${NC}" + kubectl get sa aws-load-balancer-controller -n kube-system -oyaml + + # Install AWS Load Balancer Controller using Helm + echo -e "${YELLOW}Installing AWS Load Balancer Controller using Helm...${NC}" + + # Check if repo already exists before adding it + if ! 
helm repo list | grep -q "eks"; then + helm repo add eks https://aws.github.io/eks-charts + else + echo -e "${YELLOW}Helm repository eks already exists, skipping add...${NC}" + fi + + helm repo update + + # Check if the release already exists + if helm list -n kube-system | grep -q "aws-load-balancer-controller"; then + echo -e "${YELLOW}AWS Load Balancer Controller already exists, upgrading...${NC}" + helm upgrade aws-load-balancer-controller eks/aws-load-balancer-controller \ + -n kube-system \ + --set clusterName=$EKS_CLUSTER_NAME \ + --set serviceAccount.create=false \ + --set serviceAccount.name=aws-load-balancer-controller \ + --set region=$AWS_REGION \ + --set vpcId=$VPC_ID + else + echo -e "${YELLOW}Installing AWS Load Balancer Controller...${NC}" + helm install aws-load-balancer-controller eks/aws-load-balancer-controller \ + -n kube-system \ + --set clusterName=$EKS_CLUSTER_NAME \ + --set serviceAccount.create=false \ + --set serviceAccount.name=aws-load-balancer-controller \ + --set region=$AWS_REGION \ + --set vpcId=$VPC_ID + fi + + # Verify installation + echo -e "${YELLOW}Verifying AWS Load Balancer Controller installation...${NC}" + kubectl get pods -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller + + # Clean up the policy file + rm -f iam_policy.json + + echo -e "${GREEN}✅ AWS Load Balancer Controller installation completed${NC}" + echo +} + +install_slinky_prerequisites() { + echo -e "${BLUE}=== Installing Slinky Prerequisites ===${NC}" + + # Add Helm repositories + echo -e "${YELLOW}Adding Helm repositories...${NC}" + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/ + helm repo add bitnami https://charts.bitnami.com/bitnami + helm repo add jetstack https://charts.jetstack.io + + helm repo update + + MAX_RETRIES=30 + RETRY_INTERVAL=5 + READY=false + ATTEMPT=1 + + while [[ "$READY" == "false" ]] && [[ $ATTEMPT -le $MAX_RETRIES ]]; do + # Check if deployment is available + AVAILABLE=$(kubectl get deployment aws-load-balancer-controller -n kube-system -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null) + if [[ "$AVAILABLE" == "True" ]]; then + kubectl get pods -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller + echo -e "${GREEN}✅ AWS Load Balancer Controller is available${NC}" + READY=true + break + fi + echo -e "${YELLOW}Waiting for AWS Load Balancer Controller to be available (attempt $ATTEMPT/$MAX_RETRIES)...${NC}" + sleep $RETRY_INTERVAL + ((ATTEMPT++)) + done + + if [[ "$READY" == "false" ]]; then + echo -e "${YELLOW}⚠️ AWS Load Balancer Controller not ready after waiting. Temporarily disabling webhook...${NC}" + kubectl delete -A ValidatingWebhookConfiguration aws-load-balancer-webhook --ignore-not-found=true + fi + + #V4 + if [[ "$READY" == "true" ]]; then + echo -e "${YELLOW}Installing cert-manager...${NC}" + if ! 
helm list -n cert-manager | grep -q "cert-manager"; then + echo -e "${YELLOW}Starting cert-manager installation...${NC}" + + # Install without waiting for the startup check to complete + helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --set crds.enabled=true \ + --set startupapicheck.enabled=false \ + --timeout 10m || true + + echo -e "${YELLOW}Waiting for cert-manager pods to be ready...${NC}" + kubectl wait --for=condition=ready pod -l app.kubernetes.io/instance=cert-manager -n cert-manager --timeout=5m || true + + # Check if installation succeeded + if helm list -n cert-manager | grep -q "cert-manager"; then + echo -e "${GREEN}✅ Cert-manager installed successfully${NC}" + kubectl get pods -n cert-manager + else + echo -e "${RED}⚠️ Cert-manager installation may have issues${NC}" + fi + else + echo -e "${YELLOW}cert-manager already exists, skipping installation...${NC}" + fi + else + echo -e "${RED}Cannot install cert-manager as Load Balancer Controller is not ready${NC}" + fi + + + + # Install Prometheus + echo -e "${YELLOW}Installing Prometheus...${NC}" + if ! helm list -n prometheus | grep -q "prometheus"; then + helm install prometheus prometheus-community/kube-prometheus-stack \ + --namespace prometheus --create-namespace --set installCRDs=true + else + echo -e "${YELLOW}Prometheus already exists, skipping installation...${NC}" + fi + + # Verify installations + echo -e "${YELLOW}Verifying prerequisite installations...${NC}" + kubectl get all -n cert-manager + kubectl get all -n prometheus + + # Install Slurm Operator + echo -e "${BLUE}=== Installing Slurm Operator ===${NC}" + + # Download values file + echo -e "${YELLOW}Downloading Slurm Operator values file...${NC}" + curl -L https://raw.githubusercontent.com/SlinkyProject/slurm-operator/refs/tags/v0.3.0/helm/slurm-operator/values.yaml \ + -o values-operator.yaml + + # Delete any stale CRDs + echo -e "${YELLOW}Cleaning up any stale CRDs...${NC}" + kubectl delete crd clusters.slinky.slurm.net 2>/dev/null || true + kubectl delete crd nodesets.slinky.slurm.net 2>/dev/null || true + + # Install Slurm Operator + echo -e "${YELLOW}Installing Slurm Operator...${NC}" + if helm list -n slinky | grep -q "slurm-operator"; then + echo -e "${YELLOW}Existing Slurm Operator found. 
Uninstalling...${NC}" + helm uninstall slurm-operator -n slinky + # Wait for the resources to be cleaned up + sleep 10 + fi + + echo -e "${YELLOW}Installing Slurm Operator...${NC}" + helm install slurm-operator oci://ghcr.io/slinkyproject/charts/slurm-operator \ + --values=values-operator.yaml \ + --version=0.3.0 \ + --namespace=slinky \ + --create-namespace \ + --set installCRDs=true \ + --set webhook.installCRDs=true + + # Verify Slurm Operator installation + echo -e "${YELLOW}Verifying Slurm Operator installation...${NC}" + kubectl get all -n slinky + + # Clean up values file + rm -f values-operator.yaml + + echo -e "${GREEN}✅ Slinky prerequisites installation completed${NC}" +} + + + +# Function to create cluster +set_slurm_values() { + echo -e "${BLUE}=== Setting Slurm Cluster Values ===${NC}" + + # Use the environment variables + echo -e "${YELLOW}Using environment variables:${NC}" + echo -e "Accelerated instance type: ${GREEN}$ACCEL_INSTANCE_TYPE${NC}" + echo -e "Accelerated instance count: ${GREEN}$ACCEL_INSTANCE_COUNT${NC}" + echo -e "General purpose instance type: ${GREEN}$GEN_INSTANCE_TYPE${NC}" + + # Download base values file (using g5 as a template) + echo -e "${YELLOW}Downloading base values file...${NC}" + export VALUES_FILE="custom-values.yaml" + curl -L https://github.com/aws-samples/awsome-distributed-training/raw/refs/heads/feature/slinkly-slurm-hyperpod-eks/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/g5/g5-values.yaml -o $VALUES_FILE + if [[ $? -ne 0 ]]; then + echo -e "${BLUE}Failed to download base values file.${NC}" + exit 1 + fi + + # Verify general purpose nodes + echo -e "${YELLOW}Verifying general purpose nodes with instance type: $GEN_INSTANCE_TYPE${NC}" + kubectl get nodes -l node.kubernetes.io/instance-type=$GEN_INSTANCE_TYPE + + # Verify compute nodes + echo -e "${YELLOW}Verifying compute nodes with instance type: $ACCEL_INSTANCE_TYPE${NC}" + kubectl get nodes -l node.kubernetes.io/instance-type=$ACCEL_INSTANCE_TYPE + + # Automatically detect available dlc-slurmd image from ECR + echo -e "${YELLOW}Detecting available dlc-slurmd image from ECR...${NC}" + + # Get the latest image tag from ECR repository + AVAILABLE_TAG=$(aws ecr list-images --repository-name dlc-slurmd --region $AWS_REGION --query 'imageIds[0].imageTag' --output text 2>/dev/null) + + if [[ "$AVAILABLE_TAG" == "None" ]] || [[ -z "$AVAILABLE_TAG" ]]; then + echo -e "${RED}No dlc-slurmd images found in ECR repository${NC}" + echo -e "${YELLOW}Falling back to public image: ghcr.io/slinkyproject/slurmd:25.05-ubuntu24.04${NC}" + CONTAINER_IMAGE="ghcr.io/slinkyproject/slurmd:25.05-ubuntu24.04" + CONTAINER_REPO="ghcr.io/slinkyproject/slurmd" + else + echo -e "${GREEN}Found image tag: $AVAILABLE_TAG${NC}" + CONTAINER_IMAGE="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/dlc-slurmd:$AVAILABLE_TAG" + CONTAINER_REPO="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/dlc-slurmd" + fi + + echo -e "${YELLOW}Using container image: ${GREEN}$CONTAINER_IMAGE${NC}" + + # Generate SSH key if needed + echo -e "${YELLOW}Checking for SSH key...${NC}" + if [[ ! -f ~/.ssh/id_rsa.pub ]]; then + echo -e "${YELLOW}No SSH key found. 
Generating new SSH key...${NC}" + ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa + else + echo -e "${GREEN}Using existing SSH key at ~/.ssh/id_rsa.pub${NC}" + fi + + # Get SSH public key + SSH_PUBLIC_KEY=$(cat ~/.ssh/id_rsa.pub) + + # Update values file with user's configuration + echo -e "${YELLOW}Customizing values file with your configuration...${NC}" + + # Update common affinity for non-compute components to use general purpose instance type + yq eval ".commonAffinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0] = \"$GEN_INSTANCE_TYPE\"" -i $VALUES_FILE + + # Update compute node configuration + echo -e "${YELLOW}Updating compute node configuration...${NC}" + + # Update container image - repository + yq eval ".compute.nodesets[0].image.repository = \"$CONTAINER_REPO\"" -i $VALUES_FILE + + # Update image tag if using ECR + if [[ "$CONTAINER_REPO" == *"ecr"* ]]; then + yq eval ".compute.nodesets[0].image.tag = \"$AVAILABLE_TAG\"" -i $VALUES_FILE + fi + + # Update SSH public key + yq eval ".login.rootSshAuthorizedKeys[0] = \"$SSH_PUBLIC_KEY\"" -i $VALUES_FILE + + # Update node count to match the accelerated instance count + yq eval ".compute.nodesets[0].replicas = $ACCEL_INSTANCE_COUNT" -i $VALUES_FILE + + # Update node selector to match the accelerated instance type + yq eval ".compute.nodesets[0].nodeSelector.\"node.kubernetes.io/instance-type\" = \"$ACCEL_INSTANCE_TYPE\"" -i $VALUES_FILE + + # Remove OpenZFS configurations + yq eval 'del(.login.extraVolumeMounts[] | select(.name == "fsx-openzfs"))' -i $VALUES_FILE + yq eval 'del(.login.extraVolumes[] | select(.name == "fsx-openzfs"))' -i $VALUES_FILE + yq eval 'del(.compute.nodesets[].extraVolumeMounts[] | select(.name == "fsx-openzfs"))' -i $VALUES_FILE + yq eval 'del(.compute.nodesets[].extraVolumes[] | select(.name == "fsx-openzfs"))' -i $VALUES_FILE + + # Check if general purpose node has capacity, if not remove restrictive affinity + GEN_NODE_CAPACITY=$(kubectl get nodes -l node.kubernetes.io/instance-type=$GEN_INSTANCE_TYPE -o jsonpath='{.items[0].status.allocatable.pods}' 2>/dev/null || echo "0") + GEN_NODE_USED=$(kubectl get pods --all-namespaces --field-selector spec.nodeName=$(kubectl get nodes -l node.kubernetes.io/instance-type=$GEN_INSTANCE_TYPE -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) 2>/dev/null | wc -l || echo "0") + + if [[ $GEN_NODE_USED -ge $GEN_NODE_CAPACITY ]] && [[ $GEN_NODE_CAPACITY -gt 0 ]]; then + echo -e "${YELLOW}General purpose node is at capacity ($GEN_NODE_USED/$GEN_NODE_CAPACITY), removing restrictive affinity...${NC}" + yq eval 'del(.commonAffinity)' -i $VALUES_FILE + else + echo -e "${GREEN}General purpose node has capacity, keeping affinity rules${NC}" + fi + + dynamic_pods_allocation #this function is not tested + + echo -e "\n${BLUE}=== Configuration Parameters ===${NC}" + echo -e "${YELLOW}Please review the following configuration:${NC}" + echo "----------------------------------------" + yq eval '... 
comments=""' custom-values.yaml + echo "----------------------------------------" + + echo -e "\n${YELLOW}Please verify if these values look correct.${NC}" + read + echo -e "${GREEN}✓ Slurm values have been successfully configured!${NC}" +} + +get_gpu_info() { + local ec2_type=${ACCEL_INSTANCE_TYPE#ml.} + local gpu_count=$(aws ec2 describe-instance-types --instance-types "${ec2_type}" \ + --query 'InstanceTypes[0].GpuInfo.Gpus[0].Count' --output text) + echo "${gpu_count}" +} + +has_efa_support() { + local ec2_type=${ACCEL_INSTANCE_TYPE#ml.} + local efa_supported=$(aws ec2 describe-instance-types --instance-types "${ec2_type}" \ + --query 'InstanceTypes[0].NetworkInfo.EfaSupported' --output text) + echo "${efa_supported}" +} + +calculate_pods_per_node() { + local gpu_count=$1 + if [ ${gpu_count} -le 4 ]; then + echo 1 # One pod for nodes with less than 4 GPUs + elif [ ${gpu_count} -ge 8 ]; then + echo 2 # Two pods for nodes with 8 or more GPUs + else + echo 1 + fi +} + +calculate_gpus_per_pod() { + local gpu_count=$1 + local pods_per_node=$2 + echo $(( gpu_count / pods_per_node )) +} + +dynamic_pods_allocation(){ + #dynamic allocation of pods per node based on the instance type + local gpu_count=get_gpu_info + + local efa_info=$(get_efa_info) + local efa_supported=$(echo "${efa_info}" | jq -r '.EfaSupported') + local max_efa=$(echo "${efa_info}" | jq -r '.MaxEfa') + + local pods_per_node=$(calculate_pods_per_node ${gpu_count}) #returns 1 or 2 based on the gpu count + local gpus_per_pod=$(calculate_gpus_per_pod ${gpu_count} ${pods_per_node}) # diveds the total number of gpus in in an instance by the number of pods + + local total_rep = $(( ACCEL_INSTANCE_COUNT * pods_per_node )) + local resources_path=".compute.nodesets[0]" + + #number of replicas = the total number of pods + # Update replicas to match pods per node + yq eval "${resources_path}.replicas = ${total_rep}" -i "${values_file}" + + # Clear existing resources configuration + yq eval "${resources_path}.resources = {}" -i "${values_file}" + + # Set GPU resources + yq eval "${resources_path}.resources.limits.\"nvidia.com/gpu\" = ${gpus_per_pod}" -i "${values_file}" + yq eval "${resources_path}.resources.requests.\"nvidia.com/gpu\" = ${gpus_per_pod}" -i "${values_file}" + + # Add EFA configuration for p5 instances + if [ "${efa_supported}" == "true" ] && [ "${max_efa}" -gt 0 ]; then + local efa_per_pod=$(( max_efa / pods_per_node )) + yq eval "${resources_path}.resources.limits.\"vpc.amazonaws.com/efa\" = ${efa_per_pod}" -i "${values_file}" + yq eval "${resources_path}.resources.requests.\"vpc.amazonaws.com/efa\" = ${efa_per_pod}" -i "${values_file}" + fi + echo "Configuration set for ${instance_type}:" + echo "- GPUs per node: ${gpu_count}" + echo "- Pods per node: ${pods_per_node}" + echo "- GPUs per pod: ${gpus_per_pod}" + # +} + +create_and_verify_fsx_pvc() { + local namespace="slurm" + local pvc_name="fsx-claim" + local max_retries=30 + local retry_interval=10 + + echo "Creating FSx for Lustre PVC in ${namespace} namespace..." + + # Create namespace if it doesn't exist + if ! kubectl get namespace ${namespace} >/dev/null 2>&1; then + echo "Creating namespace: ${namespace}" + kubectl create ns ${namespace} + if [ $? 
-ne 0 ]; then + echo "Failed to create namespace ${namespace}" + return 1 + fi + fi + + local yaml_file="lustre-pvc-slurm.yaml" + local yaml_url="https://github.com/aws-samples/awsome-distributed-training/raw/refs/heads/feature/slinkly-slurm-hyperpod-eks/1.architectures/7.sagemaker-hyperpod-eks/slinky-slurm/lustre-pvc-slurm.yaml" + + if [ ! -f "${yaml_file}" ]; then + echo "PVC YAML file not found. Downloading from repository..." + if ! curl -s -L -o "${yaml_file}" "${yaml_url}"; then + echo "Failed to download ${yaml_file}" + return 1 + fi + echo "Successfully downloaded ${yaml_file}" + else + echo "Using existing ${yaml_file}" + fi + + # Apply the PVC configuration + echo "Creating PVC ${pvc_name}..." + kubectl apply -f "${yaml_file}" + if [ $? -ne 0 ]; then + echo "Failed to apply PVC configuration" + return 1 + fi + + # Wait for PVC to be bound + echo "Waiting for PVC to be bound..." + + seconds=0 + timeout=600 # 10 minutes + retry_interval=60 # 1 minute + + while [ $seconds -lt $timeout ]; do + status=$(kubectl get pvc ${pvc_name} -n ${namespace} -ojson | jq -r .status.phase) + + if [ "$status" == "Bound" ]; then + echo "PVC successfully bound!" + break + fi + + remaining=$((timeout - seconds)) + echo "Current status: ${status}, waiting ${retry_interval} seconds... (${remaining} seconds remaining)" + sleep ${retry_interval} + seconds=$((seconds + retry_interval)) + done + + if [ $seconds -ge $timeout ]; then + echo "Timeout of ${timeout} seconds reached waiting for PVC to be bound." + return 1 + fi + + # Get and display PVC details + echo "PVC Details:" + kubectl get pvc -n ${namespace} + + # Get volume ID + volume_name=$(kubectl get pvc ${pvc_name} -n ${namespace} -ojson | jq -r .spec.volumeName) + if [ -n "$volume_name" ]; then + volume_id=$(kubectl get pv ${volume_name} -ojson | jq -r .spec.csi.volumeHandle) + echo "Volume ID: ${volume_id}" + else + echo "Failed to get volume name" + return 1 + fi + + return 0 +} + + +# Function to deploy Slurm cluster +deploy_slurm_cluster() { + local namespace="${1:-slurm}" + local values_file="${2:-custom-values.yaml}" + local version="${3:-0.3.0}" + local dry_run="${4:-false}" + local configure_nlb="${5:-false}" + + echo -e "${BLUE}=== Deploying Slurm Cluster ===${NC}" + + # Verify the values file exists + if [[ ! -f "$values_file" ]]; then + echo -e "${RED}Error: Values file $values_file not found${NC}" + return 1 + fi + + # Perform dry run if requested + if [[ "$dry_run" == "true" ]]; then + echo -e "${YELLOW}Performing dry run installation...${NC}" + helm install --dry-run slurm oci://ghcr.io/slinkyproject/charts/slurm \ + --values="$values_file" --version="$version" --namespace="$namespace" + + # Check if dry run was successful + if [[ $? -ne 0 ]]; then + echo -e "${RED}Dry run failed. Please check the values file and try again.${NC}" + return 1 + fi + echo -e "${GREEN}Dry run completed successfully.${NC}" + + # Don't proceed further if this is just a dry run + if [[ "$dry_run" == "true" ]]; then + return 0 + fi + fi + + # Create namespace if it doesn't exist + if ! kubectl get namespace "$namespace" &>/dev/null; then + echo -e "${YELLOW}Creating namespace $namespace...${NC}" + kubectl create namespace "$namespace" + fi + + # Perform actual installation + echo -e "${YELLOW}Installing Slurm cluster...${NC}" + helm install slurm oci://ghcr.io/slinkyproject/charts/slurm \ + --values="$values_file" --version="$version" --namespace="$namespace" + + # Check if installation was successful + if [[ $? 
-ne 0 ]]; then + echo -e "${RED}Installation failed. Please check the error messages above.${NC}" + return 1 + fi + + echo -e "${GREEN}✅ Slurm cluster installation initiated${NC}" + + # Watch the deployment status + echo -e "${YELLOW}Watching deployment status...${NC}" + kubectl -n "$namespace" get pods -l app.kubernetes.io/instance=slurm --watch & + watch_pid=$! + # Allow user to stop watching after a while + sleep 15 + kill $watch_pid 2>/dev/null + echo -e "\n${YELLOW}Continuing with deployment...${NC}" + # Verify the deployment status of all components + echo -e "${YELLOW}Verifying deployment status of all components...${NC}" + kubectl get all -n "$namespace" + + echo -e "${GREEN}✅ Slurm cluster deployment completed${NC}" + echo -e "${YELLOW}Note: It may take a few minutes for all components to start up${NC}" + + # Configure NLB if requested + if [[ "$configure_nlb" == "true" ]]; then + echo -e "${YELLOW}Configuring Network Load Balancer for login access...${NC}" + # Wait a bit for the service to be created + sleep 10 + configure_login_nlb "$namespace" "slurm-login" + fi + + return 0 +} + +# Function to configure a Login Network Load Balancer +configure_login_nlb() { + local namespace="${1:-slurm}" + local service_name="${2:-slurm-login}" + + echo -e "${BLUE}=== Configuring Login Network Load Balancer ===${NC}" + + # Identify public subnets in the VPC + echo -e "${YELLOW}Identifying public subnets in VPC...${NC}" + export PUBLIC_SUBNET_ID_1=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=${VPC_ID}" "Name=map-public-ip-on-launch,Values=true" --query "Subnets[0].SubnetId" --output text) + export PUBLIC_SUBNET_ID_2=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=${VPC_ID}" "Name=map-public-ip-on-launch,Values=true" --query "Subnets[1].SubnetId" --output text) + + # Verify subnets were found + if [[ -z "$PUBLIC_SUBNET_ID_1" || "$PUBLIC_SUBNET_ID_1" == "None" || -z "$PUBLIC_SUBNET_ID_2" || "$PUBLIC_SUBNET_ID_2" == "None" ]]; then + echo -e "${RED}Error: Could not find two public subnets in VPC ${VPC_ID}${NC}" + return 1 + fi + + echo -e "${GREEN}Found public subnets: ${PUBLIC_SUBNET_ID_1}, ${PUBLIC_SUBNET_ID_2}${NC}" + + # Add annotations to the service to make it internet facing + echo -e "${YELLOW}Adding annotations to ${service_name} service...${NC}" + kubectl annotate service ${service_name} -n ${namespace} \ + service.beta.kubernetes.io/aws-load-balancer-type="nlb" \ + service.beta.kubernetes.io/aws-load-balancer-scheme="internet-facing" \ + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type="ip" \ + service.beta.kubernetes.io/aws-load-balancer-subnets="${PUBLIC_SUBNET_ID_1},${PUBLIC_SUBNET_ID_2}" \ + service.beta.kubernetes.io/aws-load-balancer-healthcheck-port="22" \ + --overwrite + + # Verify the service configuration + echo -e "${YELLOW}Verifying service configuration...${NC}" + kubectl describe service ${service_name} -n ${namespace} + + # Get the NLB DNS name + NLB_DNS=$(kubectl get service ${service_name} -n ${namespace} -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') + + if [[ -n "$NLB_DNS" ]]; then + echo -e "${GREEN}✅ Login NLB configured successfully${NC}" + echo -e "${GREEN}✅ You can access the Slurm login node using:${NC}" + echo -e "${YELLOW}ssh -i ~/.ssh/id_rsa @${NLB_DNS}${NC}" + else + echo -e "${YELLOW}NLB DNS name not yet available. 
It may take a few minutes to provision.${NC}" + echo -e "${YELLOW}Run the following command later to get the DNS name:${NC}" + echo -e "${YELLOW}kubectl get service ${service_name} -n ${namespace} -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'${NC}" + fi + + return 0 +} + + +#===Main Script=== +main() { + print_header "🚀 Welcome to the SageMaker HyperPod Slurm Cluster Creation Script! 🚀" + + # Prerequisites + display_important_prereqs + + # Checking AWS Account ID + echo -e "\n${BLUE}🔍 AWS Account Verification${NC}" + echo -e "Your AWS Account ID is: ${GREEN}$AWS_ACCOUNT_ID${NC}" + echo "Press Enter to confirm ✅ or Ctrl+C to exit❌..." + read + + # Checking Git installation + check_git + + # Checking AWS CLI version and installation + echo -e "\n${BLUE}📦 1a: AWS CLI Installation and Verification${NC}" + check_and_install_aws_cli + + # Checking Region + echo -e "\n${BLUE}🌎 AWS Region Configuration${NC}" + region_check + # Cluster Configuration + echo -e "${BLUE} Generating cluster configuration...${NC}" + create_config + create_fsx_lustre_storage_class + install_aws_load_balancer_controller + install_slinky_prerequisites + set_slurm_values + create_and_verify_fsx_pvc + deploy_slurm_cluster "slurm" "custom-values.yaml" "0.3.0" "false" "true" + goodbye +} +# Execute the main function +main \ No newline at end of file
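
Once the script finishes, a quick sanity check of the resulting deployment (assuming the default "slurm" namespace, the slurm-login NLB configured above, and root SSH access via the key added to rootSshAuthorizedKeys) might look like:

    kubectl get pods -n slurm
    NLB_DNS=$(kubectl get service slurm-login -n slurm -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
    ssh -i ~/.ssh/id_rsa root@${NLB_DNS} sinfo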